1 Aim of the analysis

Aim of the analysis is to first identify processing sites (PSS) and transcriptional start sites (TSS). Using the set of PSS, processing sites dependent on RNase E will be identified by comparing which of those sites are enriched or depleted in a thermo-sensitive strain rne(Ts) (also called “dIF” in some subsequent analyses) compared to a wild-typic strain rne(WT) (also called “dWT” in some subsequent analyses) after 1 hour growth at 39°C (compare Chao et al. (2017) and Förstner et al. (2018)).

The sets of PSS and TSS will be compared to data by Kopf et al. (2014) (in the following: “anno-TSS” and “TUs”). PSS dependent on RNase E will be analyzed for nucleotide composition, secondary structures, functional enrichment etc.

#import gff for anno-TSS
anno_TSS <- unique(rtracklayer::import("input/Kopf_TSS.gff")) # 14 int-TSS are duplicates and are also annotated as alt-TSS
features <- rtracklayer::import("input/20210217_syne_onlyUnique_withFeat.gff3")
TUs <- rtracklayer::import("input/Kopf_4091_TUs_combined.gff3")

2 Identification and characterization of TSS and PSS sites

2.1 Identification of TSS and PSS using edgeR and DESeq2

Input are tables of PSS and TSS 5’ end counts prepared using a workflow on usegalaxy.eu and a python script. Sequencing data was created from libraries prepared similar to the protocol described in Innocenti et al. (2015) (tagRNA-Seq).

PSS_raw <- read.delim("input/PSS_5ends_combined.txt", header=TRUE, row.names=1)
names(PSS_raw) <- paste(names(PSS_raw), "_PSS", sep="")
TSS_raw <- read.delim("input/TSS_5ends_combined.txt", header=TRUE, row.names=1)
names(TSS_raw) <- paste(names(TSS_raw), "_TSS", sep="")

# merge PSS and TSS to merged_raw
merged_raw <- merge(PSS_raw, TSS_raw, by="row.names")
row.names(merged_raw) <- merged_raw$Row.names
merged_raw$Row.names <- NULL

When taking into account lowly populated sites, DESeq2 dispersion estimates become over-fitted. Hence, edgeR::filterByExpression() is used as a pre-processing step before further DESeq2 analyses.

group=c(rep(c("dIF_0h_PSS", "dIF_1h_PSS"), 3), rep(c("dWT_0h_PSS", "dWT_1h_PSS"), 3), rep(c("dIF_0h_TSS", "dIF_1h_TSS"), 3), rep(c("dWT_0h_TSS", "dWT_1h_TSS"), 3))
y <- DGEList(counts=merged_raw, group=group)
nrow(y)
## [1] 7894038
keep <- filterByExpr(y)
y <- y[keep, ,keep.lib.size=FALSE] #reduces from 7,894,038 positions to 11,265
nrow(y)
## [1] 11265
merged_filtered <- merged_raw[row.names(y$counts),]
nrow(merged_filtered)
## [1] 11265
rm(merged_raw)

Since TSS libraries are generally approximately 1.6 times larger than PSS libraries, size factors are determined separately for the different data set types. Size factors for TSS and PSS correlate quite well for different samples.

coldata_PSS <- read.csv("input/20210216_colData_PSS.csv", row.names=1)
coldata_TSS <- read.csv("input/20210216_colData_TSS.csv", row.names=1)

# create DESeq2 data object
ddsMat_PSS <- DESeqDataSetFromMatrix(countData = merged_filtered[,1:12],
                                 colData = coldata_PSS,
                                 design = ~ condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_TSS <- DESeqDataSetFromMatrix(countData = merged_filtered[,13:24],
                                 colData = coldata_TSS,
                                 design = ~ condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# run DESeq
ddsMat_PSS <- estimateSizeFactors(ddsMat_PSS) 
ddsMat_TSS <- estimateSizeFactors(ddsMat_TSS) 

# factors needed for creation of .grp file
write.csv(data.frame(factor=ddsMat_PSS$sizeFactor), file="output/PSS_sizeFactors.csv")
write.csv(data.frame(factor=ddsMat_TSS$sizeFactor), file="output/TSS_sizeFactors.csv")

sizeFact <- c(ddsMat_PSS$sizeFactor, ddsMat_TSS$sizeFactor)
plot(sizeFact[1:12], sizeFact[13:24], xlab="Size Factors PSS", ylab="Size Factors TSS")

If .grp were not yet created, create .grp files by setting if() to TRUE (! takes some time !):

if(FALSE){
ddsMat_PSS_2 <- DESeqDataSetFromMatrix(countData = PSS_raw,
                                 colData = coldata_PSS,
                                 design = ~ condition)
ddsMat_PSS_2$sizeFactor <- ddsMat_PSS$sizeFactor

PSS_norm_counts <- counts(ddsMat_PSS_2, normalized=TRUE)
rm(ddsMat_PSS_2)

grp_File <- data.frame("rneTs_0h"=apply(PSS_norm_counts[,c("dIF11_0h_PSS", "dIF12_0h_PSS", "dIF31_0h_PSS")],1,mean), "rneTs_1h"=apply(PSS_norm_counts[,c("dIF11_1h_PSS", "dIF12_1h_PSS", "dIF31_1h_PSS")],1,mean), "rneWT_0h"=apply(PSS_norm_counts[,c("dWT1_0h_PSS", "dWT2_0h_PSS", "dWT3_0h_PSS")],1,mean), "rneWT_1h"=apply(PSS_norm_counts[,c("dWT1_1h_PSS", "dWT2_1h_PSS", "dWT3_1h_PSS")],1,mean))
rm(PSS_norm_counts)
 
grp_File$seqname <- rep(NA, length(grp_File$rneTs_0h))
seqnames <- c("BA000022.2", "AP004310.1", "AP004311.1", "AP004312.1", "AP006585.1")
for(i in seqnames){
  grp_File[which(grepl(i, row.names(grp_File))),]$seqname <- i
}
grp_File$strand <- rep(NA, length(grp_File$rneTs_0h))
for(i in c("plus", "minus")){
  grp_File[which(grepl(i, row.names(grp_File))),]$strand <- i
}

for(i in seqnames){
  for(j in c("plus", "minus")){
    tmp <- subset(grp_File, grp_File$seqname==i & grp_File$strand==j)[,c("rneTs_0h", "rneTs_1h", "rneWT_0h", "rneWT_1h")]
    s=""
    if(j=="plus"){
      s="fw"
    }else{
      s="rev"
   }
    write.table(tmp, file=paste("output/grp/PSS/", i, "_PSS_Ts-0h-1h-WT-0h-1h_", s, ".grp", sep=""), sep="\t", row.names=FALSE, col.names = FALSE)
  }
}
  
}
rm(PSS_raw)
if(FALSE){
ddsMat_TSS_2 <- DESeqDataSetFromMatrix(countData = TSS_raw,
                                 colData = coldata_TSS,
                                 design = ~ condition)
ddsMat_TSS_2$sizeFactor <- ddsMat_TSS$sizeFactor

TSS_norm_counts <- counts(ddsMat_TSS_2, normalized=TRUE)
rm(ddsMat_TSS_2)

grp_File <- data.frame("rneTs_0h"=apply(TSS_norm_counts[,c("dIF11_0h_TSS", "dIF12_0h_TSS", "dIF31_0h_TSS")],1,mean), "rneTs_1h"=apply(TSS_norm_counts[,c("dIF11_1h_TSS", "dIF12_1h_TSS", "dIF31_1h_TSS")],1,mean), "rneWT_0h"=apply(TSS_norm_counts[,c("dWT1_0h_TSS", "dWT2_0h_TSS", "dWT3_0h_TSS")],1,mean), "rneWT_1h"=apply(TSS_norm_counts[,c("dWT1_1h_TSS", "dWT2_1h_TSS", "dWT3_1h_TSS")],1,mean))
rm(TSS_norm_counts)
 
grp_File$seqname <- rep(NA, length(grp_File$rneTs_0h))
seqnames <- c("BA000022.2", "AP004310.1", "AP004311.1", "AP004312.1", "AP006585.1")
for(i in seqnames){
  grp_File[which(grepl(i, row.names(grp_File))),]$seqname <- i
}
grp_File$strand <- rep(NA, length(grp_File$rneTs_0h))
for(i in c("plus", "minus")){
  grp_File[which(grepl(i, row.names(grp_File))),]$strand <- i
}

for(i in seqnames){
  for(j in c("plus", "minus")){
    tmp <- subset(grp_File, grp_File$seqname==i & grp_File$strand==j)[,c("rneTs_0h", "rneTs_1h", "rneWT_0h", "rneWT_1h")]
    s=""
    if(j=="plus"){
      s="fw"
    }else{
      s="rev"
   }
    write.table(tmp, file=paste("output/grp/TSS/", i, "_TSS_Ts-0h-1h-WT-0h-1h_", s, ".grp", sep=""), sep="\t", row.names=FALSE, col.names = FALSE)
  }
}
  
}
rm(TSS_raw)

In a subsequent step, DESeq2 is used to determine which positions are enriched in the TSS and PSS libraries, respectively. Size factors determined above are used.

coldata <- read.csv("input/20210210_colData_PSS-TSS.csv", row.names=1)

# create DESeq2 data object
ddsMat_PSSTSS <- DESeqDataSetFromMatrix(countData = merged_filtered,
                                 colData = coldata,
                                 design = ~ condition + type)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_PSSTSS$sizeFactor <- sizeFact
ddsMat_PSSTSS <- DESeq(ddsMat_PSSTSS)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_PSSTSS <- results(ddsMat_PSSTSS, contrast=c("type", "PSS", "TSS"))
write.csv(res_PSSTSS[order(res_PSSTSS$padj),], file="output/DESeq2_resultsTables/results_PSS-TSS-copmarisons.csv")

As a next stop, a first unfiltered set of potential PSS and TSS positions is created:

PSS_positions_unfiltered <- row.names(subset(res_PSSTSS, res_PSSTSS$log2FoldChange>0.8 & res_PSSTSS$padj<0.05))
TSS_positions_unfiltered <- row.names(subset(res_PSSTSS, res_PSSTSS$log2FoldChange<(-0.8) & res_PSSTSS$padj<0.05))

Create GRanges object for PSS data:

PSS_nonReduced_Ranges <- create_GRanges_object(base::strsplit(PSS_positions_unfiltered, "-"), res_PSSTSS[PSS_positions_unfiltered,]$baseMean)
PSS_nonReduced_Ranges$name <- PSS_positions_unfiltered
PSS_nonReduced_Ranges$type <- "PSS"

2.1.1 Diagnostic Plots

pdf(file="output/DESeq2_Plots/ddsMat_PSSTSS_DispEsts.pdf", width=4.5, height=4.5)
plotDispEsts(ddsMat_PSSTSS, xlab="Mean of Normalized Counts", ylab="Dispersion")
dev.off()
## png 
##   2
plotDispEsts(ddsMat_PSSTSS)

PSS and TSS data sets are separated on principal component 1, which captures 83% of the variance. Principal component 2 separates before/after heat treatment.

p <- PCA_plot_PSSTSS(ddsMat_PSSTSS, "PSS-TSS")
p

ggsave("output/DESeq2_Plots/ddsMat_PSSTSS_PCA.pdf", plot=p, width=15, height=9, units="cm")

The same holds true when observing the respective heat plot.

p <- heatmap_plot_PSSTSS(ddsMat_PSSTSS, "PSS-TSS")
p

ggsave("output/DESeq2_Plots/ddsMat_PSSTSS_heatMap.pdf", plot=p, width=15, height=12, units="cm")

Comparing PSS and TSS using DESeq2 yields a bimodal log2foldchange distribution. This underlines that PSS and TSS are quite well discernible using the chosen approach. This log2FC distribution is also observable in the non-normalized count data. This is further corroborated by the MAplot and also regarding p-values etc. (see below: Volcano plot and p-value-plot).

pdf(file="output/DESeq2_Plots/ddsMat_PSSTSS_hist_log2FC.pdf", width=4.5, height=4.5)
hist(res_PSSTSS$log2FoldChange, breaks=20, main="", xlab="Log2FC(PSS/TSS)", ylab="Frequency")
dev.off()
## png 
##   2
hist(res_PSSTSS$log2FoldChange, breaks=20, main="DESeq2 Normalization", xlab="Log2FC PSS/TSS", ylab="Frequency")

# non-normalized
log2_all <- log2(apply(merged_filtered[,1:12],1, mean)/apply(merged_filtered[,13:24],1,mean))
hist(log2_all, breaks=20, main="No normalization", xlab="Log2(PSS/TSS)")

p <- MAplot_ggplot(res_PSSTSS, foldchange=0.8, y_axis_label = "Log2 fold-change(PSS/TSS)")
p <- p + scale_colour_manual(values=c("UP"="#4e9879ff", "DOWN"="#9b511fff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p

ggsave("output/DESeq2_Plots/ddsMat_PSSTSS_MAplot.pdf",plot=p, width=15, height=12, units="cm")
res_PSSTSS <- subset(res_PSSTSS, !is.na(res_PSSTSS$padj))

# down: TSS, up: PSS
count_up_down(res_PSSTSS, foldchange=0.8, padjusted=0.05)
## [1] "number of features down:  7273"
## [1] "number of features up:  3450"
p <- volcanoPlot_ggplot(res_PSSTSS, foldchange=0.8, padjusted=0.05) + scale_colour_manual(values=c("UP"="#4e9879ff", "DOWN"="#9b511fff", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p

ggsave("output/DESeq2_Plots/ddsMat_PSSTSS_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")

pdf(file="output/DESeq2_Plots/ddsMat_PSSTSS_pValuePlot.pdf", width=4.5, height=4.5)
pvaluePlot(res_PSSTSS, "Comparison PSS to TSS")
dev.off()
## png 
##   2
pvaluePlot(res_PSSTSS, "Comparison PSS to TSS")

2.1.2 Filter potential TSS-positions for positions with (TSS-reads)/(transcript-coverage)>0.02 and <2.0

By inspecting the identified TSS, we observed that a high number of false-positive TSS positions are present in highly expressed genes. To control for this, we control for TSS with a ratio (number TSS-reads)/(number transcript reads) above a certain threshold. To determine this cut-off, we manually inspected highly expressed genomic positions such as the 5’ UTR of psbA2, ncr0700, ssrA and CRISPR3. For instance for psbA2, the actual TSS are well established by independent experiments (Sakurai et al. (2012)). 2% seemed to be a good cut-off for this control.

Further, we introduced a cut-off to filter against TSS without associated transcription. The used library preparation protocol does not capture sRNAs with a length below 200nt and TSS without accompanying transcription might be TSS of sRNAs. However, we assume a high percentage of TSS which were identified without associated transcription are false positives, e.g. created by reverse transcription artefacts. An example is the second TSS upstream of PsbA2R. The actual TSS of the transcript was determined in Sakurai et al. (2012). We decided to filter out TSS for which the ratio (number TSS-reads)/(number transcript coverage) > 200%.

First, transcript coverage data has to be read in and normalized:

transcript_coverage <- read.delim("input/transcript_coverage_combined.txt", header=TRUE, row.names=1)

group=c(rep(c("dIF_0h", "dIF_1h"), 3), rep(c("dWT_0h", "dWT_1h"), 3))
y <- DGEList(counts=transcript_coverage, group=group)
nrow(y)
## [1] 7894038
keep <- filterByExpr(y)
y <- y[keep, ,keep.lib.size=FALSE] #reduces from 7,894,038 positions to 3,839,810
nrow(y)
## [1] 3839810
transcript_coverage_filt <- transcript_coverage[row.names(y$counts),]

coldata <- read.csv("input/colData_transcript.csv", row.names=1)

# create DESeq2 data object
ddsMat_transc <- DESeqDataSetFromMatrix(countData = transcript_coverage_filt,
                                 colData = coldata,
                                 design = ~ condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_transc <- estimateSizeFactors(ddsMat_transc)
trans_norm <- as.data.frame(counts(ddsMat_transc, normalized=TRUE))
TSS_norm <- as.data.frame(counts(ddsMat_TSS, normalized=TRUE))
PSS_norm <- as.data.frame(counts(ddsMat_PSS, normalized=TRUE))

Create transcript .grp files:

if(FALSE){
ddsMat_transc_2 <- DESeqDataSetFromMatrix(countData = transcript_coverage,
                                 colData = coldata,
                                 design = ~ condition)
ddsMat_transc_2$sizeFactor <- ddsMat_transc$sizeFactor

transc_norm_counts <- counts(ddsMat_transc_2, normalized=TRUE)
rm(ddsMat_transc_2)

grp_File <- data.frame("rneTs_0h"=apply(transc_norm_counts[,c("dIF11_0h", "dIF12_0h", "dIF31_0h")],1,mean), "rneTs_1h"=apply(transc_norm_counts[,c("dIF11_1h", "dIF12_1h", "dIF31_1h")],1,mean), "rneWT_0h"=apply(transc_norm_counts[,c("dWT1_0h", "dWT2_0h", "dWT3_0h")],1,mean), "rneWT_1h"=apply(transc_norm_counts[,c("dWT1_1h", "dWT2_1h", "dWT3_1h")],1,mean))
rm(transc_norm_counts)
 
grp_File$seqname <- rep(NA, length(grp_File$rneTs_0h))
seqnames <- c("BA000022.2", "AP004310.1", "AP004311.1", "AP004312.1", "AP006585.1")
for(i in seqnames){
  grp_File[which(grepl(i, row.names(grp_File))),]$seqname <- i
}
grp_File$strand <- rep(NA, length(grp_File$rneTs_0h))
for(i in c("plus", "minus")){
  grp_File[which(grepl(i, row.names(grp_File))),]$strand <- i
}

for(i in seqnames){
  for(j in c("plus", "minus")){
    tmp <- subset(grp_File, grp_File$seqname==i & grp_File$strand==j)[,c("rneTs_0h", "rneTs_1h", "rneWT_0h", "rneWT_1h")]
    s=""
    if(j=="plus"){
      s="fw"
    }else{
      s="rev"
   }
    write.table(tmp, file=paste("output/grp/transcript/", i, "_transcript_Ts-0h-1h-WT-0h-1h_", s, ".grp", sep=""), sep="\t", row.names=FALSE, col.names = FALSE)
  }
}
  
}
rm(transcript_coverage) # remove to make space 
rm(ddsMat_transc)

Do actual filtering:

TSS_trans_ratio_cutoff <- 0.02
upper_TSS_trans_ratio_cutoff <- 2.00
TSS_trans_ratio <- apply(TSS_norm[TSS_positions_unfiltered,],1,mean)/apply(trans_norm[TSS_positions_unfiltered,],1,mean)
TSS_above_cutoff <- subset(TSS_positions_unfiltered, TSS_trans_ratio[TSS_positions_unfiltered]>TSS_trans_ratio_cutoff & TSS_trans_ratio[TSS_positions_unfiltered]<upper_TSS_trans_ratio_cutoff)
TSS_nonReduced_Ranges <- create_GRanges_object(base::strsplit(TSS_above_cutoff, "-"), res_PSSTSS[TSS_above_cutoff,]$baseMean)
TSS_nonReduced_Ranges$name <- TSS_above_cutoff
length(reduce(TSS_nonReduced_Ranges))
## [1] 4792

For the actual analysis, we decided to only use TSS which are associated with a lot of evidence. Hence, we decided to only use those TSS, which are within a distance of 20nt of annotated TSS by Kopf et al. (2014).

highly_prob_TSS <- TSS_nonReduced_Ranges[which(countOverlaps(TSS_nonReduced_Ranges, anno_TSS, maxgap=20)>0)]
names_prob_TSS <- highly_prob_TSS$name
length(reduce(highly_prob_TSS))
## [1] 2905
highly_prob_TSS$type <- "TSS"
save_gff(c(highly_prob_TSS,PSS_nonReduced_Ranges), "output/annoTSS_comparison/gffs/", "TSS_PSS_rneAnalysis")

2.2 Characterization of newly identified TSS and PSS positions

2.2.1 Count how many PSS overlap with anno-TSS

write.csv(as.data.frame(PSS_nonReduced_Ranges[which(countOverlaps(PSS_nonReduced_Ranges, anno_TSS)>0)]), file="output/annoTSS_comparison/PSS_overlapping_annoTSS.csv")
# prepare data.frame to count numbers of PSS overlapping with anno-TSS
df_overlap_TSSPSS <- data.frame(type=rep("PSS", 4), TSS=c("start", "alt", "int", "i-start"))
df_overlap_TSSPSS$number <- NA

## count PSS overlapping with certain anno-TSS
# overlap of start anno_TSS _without_ TU_type: i with PSS
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="PSS" & df_overlap_TSSPSS$TSS=="start",3] <- sum(countOverlaps(PSS_nonReduced_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="start" & !grepl("i", anno_TSS$TU_type, fixed = TRUE))))

# overlap of start anno_TSS _with only_ TU_type: i with PSS
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="PSS" & df_overlap_TSSPSS$TSS=="i-start",3] <- sum(countOverlaps(PSS_nonReduced_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="start" & grepl("i", anno_TSS$TU_type, fixed = TRUE))))

# overlap of alt anno_TSS with PSS
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="PSS" & df_overlap_TSSPSS$TSS=="alt",3] <- sum(countOverlaps(PSS_nonReduced_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="alt")))

# overlap of int anno_TSS with PSS
df_overlap_TSSPSS[df_overlap_TSSPSS$type=="PSS" & df_overlap_TSSPSS$TSS=="int",3] <- sum(countOverlaps(PSS_nonReduced_Ranges, subset(anno_TSS, anno_TSS$TSS_type=="int")))

#control
sum((countOverlaps(PSS_nonReduced_Ranges, anno_TSS)>0))
## [1] 47
sum(df_overlap_TSSPSS[df_overlap_TSSPSS$type=="PSS",]$number)
## [1] 47

The majority of anno-TSS which overlap with PSS are not canonical start TSS, but actual int or alt TSS.

p <- ggplot(data=df_overlap_TSSPSS, aes(x=type, y=number, fill=TSS)) + geom_bar(position="fill", stat="identity") + theme_light()+ xlab("") + ylab("Percentage") + scale_fill_grey(start = .2, end = .9)
p

ggsave(filename = "output/annoTSS_comparison/barplot_TSS-PSS-KopfTSS.pdf", plot = p, width = 7, height = 7, units = "cm")

Calculate how many PSS overlap with annotated regions:

sum(countOverlaps(PSS_nonReduced_Ranges,TUs)>0|countOverlaps(PSS_nonReduced_Ranges,features)>0)/length(PSS_nonReduced_Ranges)
## [1] 0.9869565

Calculate number and percentage of TUs and CDS which are overlapped by captured PSS:

sum(countOverlaps(TUs,PSS_nonReduced_Ranges)>0)
## [1] 656
sum(countOverlaps(TUs,PSS_nonReduced_Ranges)>0)/length(TUs)
## [1] 0.160352
sum(countOverlaps(features,PSS_nonReduced_Ranges)>0)
## [1] 737
sum(countOverlaps(features,PSS_nonReduced_Ranges)>0)/length(features)
## [1] 0.12015

2.2.2 Extract DNA sequences upstream of TSS and upstream + downstream of PSS

syne_fasta <- readDNAStringSet("input/Synechocystis.fasta", format="fasta")

Actually interesting stuff is only happening in region -20 nt upstream: Hence, only extract this region.

TSS_DNA_sequences_40nt <- extract_DNA_sequences(highly_prob_TSS, syne_fasta, len=20, only_upstream=TRUE)
PSS_DNA_sequences <- extract_DNA_sequences(PSS_nonReduced_Ranges, syne_fasta, len=10, only_upstream=FALSE)

writeXStringSet(TSS_DNA_sequences_40nt, "output/annoTSS_comparison/weblogos/allTSS_20nt_forWeblogo.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(PSS_DNA_sequences, "output/annoTSS_comparison/weblogos/PSS_10nt_forWeblogo_upAndDownstream.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")

run createWeblogos_10-20nt_DNA.txt for creation of sequence logos

3 Use newly identified sets of TSS and PSS for identifying differentially expressed positions between rne(WT) and rne(Ts)

PSS_raw <- read.delim("input/PSS_5ends_combined.txt", header=TRUE, row.names=1)
PSS_use <- PSS_raw[PSS_positions_unfiltered,]

TSS_raw <- read.delim("input/TSS_5ends_combined.txt", header=TRUE, row.names=1)
TSS_use <- TSS_raw[names_prob_TSS,]

rm(PSS_raw)
rm(TSS_raw)

coldata <- read.csv("input/20210125_colData.csv", row.names=1)

ddsMat_PSS <- DESeqDataSetFromMatrix(countData = PSS_use,
                                 colData = coldata,
                                 design = ~ condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_TSS <- DESeqDataSetFromMatrix(countData = TSS_use,
                                 colData = coldata,
                                 design = ~ condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_PSS <- DESeq(ddsMat_PSS) 
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsMat_TSS <- DESeq(ddsMat_TSS) 
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
write.csv(data.frame(counts(ddsMat_PSS, normalized=TRUE)), file="output/PSS_normalizedCounts.csv")
write.csv(data.frame(counts(ddsMat_TSS, normalized=TRUE)), file="output/TSS_normalizedCounts.csv")

3.1 Diagnostic Plots

pdf(file="output/DESeq2_Plots/ddsMat_PSS_DispEsts.pdf", width=4.5, height=4.5)
plotDispEsts(ddsMat_PSS, main="PSS comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")
dev.off()
## png 
##   2
pdf(file="output/DESeq2_Plots/ddsMat_TSS_DispEsts.pdf", width=4.5, height=4.5)
plotDispEsts(ddsMat_TSS, main="TSS comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")
dev.off()
## png 
##   2
plotDispEsts(ddsMat_PSS, main="PSS comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")

plotDispEsts(ddsMat_TSS, main="TSS comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")

p <- PCA_plot(ddsMat_PSS, "PSS")
p 

ggsave("output/DESeq2_Plots/ddsMat_PSS_PCA.pdf", plot=p, width=9, height=9, units="cm")

p <- PCA_plot(ddsMat_TSS, "TSS")
p

ggsave("output/DESeq2_Plots/ddsMat_TSS_PCA.pdf", plot=p, width=9, height=9, units="cm")

p <- heatmap_plot(ddsMat_PSS, "PSS")
p

ggsave("output/DESeq2_Plots/ddsMat_PSS_heatMap.pdf", plot=p, width=15, height=12, units="cm")

p <- heatmap_plot(ddsMat_TSS, "TSS")
p

ggsave("output/DESeq2_Plots/ddsMat_TSS_heatMap.pdf", plot=p, width=15, height=12, units="cm")

3.2 Extract Results

# extract results
PSS_result_dWT_dIF_1h <- results(ddsMat_PSS, contrast=c("condition", "dWT_1h", "dIF_1h")) # dWT/dIF -> higher in dWT: higher log2FC
write.csv(PSS_result_dWT_dIF_1h[order(PSS_result_dWT_dIF_1h$padj),], file="output/DESeq2_resultsTables/results_PSS-1h.csv")

PSS_result_dWT_dIF_0h <- results(ddsMat_PSS, contrast=c("condition", "dWT_0h", "dIF_0h")) # dWT/dIF -> higher in dWT: higher log2FC
write.csv(PSS_result_dWT_dIF_0h[order(PSS_result_dWT_dIF_0h$padj),], file="output/DESeq2_resultsTables/results_PSS-0h.csv")

PSS_result_dWT_0h_1h <- results(ddsMat_PSS, contrast=c("condition", "dWT_1h", "dWT_0h")) # dWT 1h/0h -> higher in 1h: higher log2FC
write.csv(PSS_result_dWT_0h_1h[order(PSS_result_dWT_0h_1h$padj),], file="output/DESeq2_resultsTables/results_PSS-dWT-0h-1h.csv")

PSS_result_dIF_0h_1h <- results(ddsMat_PSS, contrast=c("condition", "dIF_1h", "dIF_0h")) # dIF 1h/0h -> higher in 1h: higher log2FC
write.csv(PSS_result_dIF_0h_1h[order(PSS_result_dIF_0h_1h$padj),], file="output/DESeq2_resultsTables/results_PSS-dIF-0h-1h.csv")

TSS_result_dWT_dIF_1h <- results(ddsMat_TSS, contrast=c("condition", "dWT_1h", "dIF_1h")) # dWT/dIF -> higher in dWT: higher log2FC
write.csv(TSS_result_dWT_dIF_1h[order(TSS_result_dWT_dIF_1h$padj),], file="output/DESeq2_resultsTables/results_TSS-1h.csv")

TSS_result_dWT_dIF_0h <- results(ddsMat_TSS, contrast=c("condition", "dWT_0h", "dIF_0h")) # dWT/dIF -> higher in dWT: higher log2FC
write.csv(TSS_result_dWT_dIF_0h[order(TSS_result_dWT_dIF_0h$padj),], file="output/DESeq2_resultsTables/results_TSS-0h.csv")

TSS_result_dWT_0h_1h <- results(ddsMat_TSS, contrast=c("condition", "dWT_1h", "dWT_0h")) # dWT 1h/0h -> higher in 1h: higher log2FC
write.csv(TSS_result_dWT_0h_1h[order(TSS_result_dWT_0h_1h$padj),], file="output/DESeq2_resultsTables/results_TSS-dWT-0h-1h.csv")

Annotate peak regions

indices_plus <- which(strand(PSS_nonReduced_Ranges)=="+")
indices_minus <- which(strand(PSS_nonReduced_Ranges)=="-") 
PSS_names <- c()
PSS_names[indices_plus] <- paste(seqnames(PSS_nonReduced_Ranges[indices_plus]),start(PSS_nonReduced_Ranges[indices_plus]),"plus",sep="-")
PSS_names[indices_minus] <- paste(seqnames(PSS_nonReduced_Ranges[indices_minus]),start(PSS_nonReduced_Ranges[indices_minus]),"minus",sep="-")
rm(indices_plus)
rm(indices_minus)
PSS_nonReduced_Ranges$names <- PSS_names
PSS_nonReduced_Ranges$featuresOverlap <- rep("", length(PSS_nonReduced_Ranges))
PSS_nonReduced_Ranges$TU_overlap <- rep("", length(PSS_nonReduced_Ranges))
for(i in 1:length(PSS_nonReduced_Ranges)){
  PSS_nonReduced_Ranges$featuresOverlap[i] <- paste(features[which(countOverlaps(features,PSS_nonReduced_Ranges[i])>0)]$locus_tag,collapse=",")
  PSS_nonReduced_Ranges$TU_overlap[i] <- paste(TUs[which(countOverlaps(TUs,PSS_nonReduced_Ranges[i])>0)]$index,collapse=",")
}
# create tables with annotation: which features and TUs are overlapping with PSS?
df_PSS_nonRed_Ranges <- as.data.frame(PSS_nonReduced_Ranges)
row.names(df_PSS_nonRed_Ranges) <- df_PSS_nonRed_Ranges$names

# PSS result dWT dIF at 1h
PSS_result_dWT_dIF_1h_annot <- rownames_to_column(as.data.frame(PSS_result_dWT_dIF_1h[order(PSS_result_dWT_dIF_1h$padj),]))
PSS_result_dWT_dIF_1h_annot$CDS <- df_PSS_nonRed_Ranges[PSS_result_dWT_dIF_1h_annot$rowname,]$featuresOverlap
PSS_result_dWT_dIF_1h_annot$TUs <- df_PSS_nonRed_Ranges[PSS_result_dWT_dIF_1h_annot$rowname,]$TU_overlap
PSS_result_dWT_dIF_1h_annot <- PSS_result_dWT_dIF_1h_annot[,c(8,9,1,2:7)]
write_tsv(PSS_result_dWT_dIF_1h_annot, "output/DESeq2_resultsTables/results_PSS-1h_annotated.tsv")

# PSS result dWT dIF at 0h
PSS_result_dWT_dIF_0h_annot <- rownames_to_column(as.data.frame(PSS_result_dWT_dIF_0h[order(PSS_result_dWT_dIF_0h$padj),]))
PSS_result_dWT_dIF_0h_annot$CDS <- df_PSS_nonRed_Ranges[PSS_result_dWT_dIF_0h_annot$rowname,]$featuresOverlap
PSS_result_dWT_dIF_0h_annot$TUs <- df_PSS_nonRed_Ranges[PSS_result_dWT_dIF_0h_annot$rowname,]$TU_overlap
PSS_result_dWT_dIF_0h_annot <- PSS_result_dWT_dIF_0h_annot[,c(8,9,1,2:7)]
write_tsv(PSS_result_dWT_dIF_0h_annot, "output/DESeq2_resultsTables/results_PSS-0h_annotated.tsv")

# PSS result dWT comparing 0h and 1h
PSS_result_dWT_0h_1h_annot <- rownames_to_column(as.data.frame(PSS_result_dWT_0h_1h[order(PSS_result_dWT_0h_1h$padj),]))
PSS_result_dWT_0h_1h_annot$CDS <- df_PSS_nonRed_Ranges[PSS_result_dWT_0h_1h_annot$rowname,]$featuresOverlap
PSS_result_dWT_0h_1h_annot$TUs <- df_PSS_nonRed_Ranges[PSS_result_dWT_0h_1h_annot$rowname,]$TU_overlap
PSS_result_dWT_0h_1h_annot <- PSS_result_dWT_0h_1h_annot[,c(8,9,1,2:7)]
write_tsv(PSS_result_dWT_0h_1h_annot, "output/DESeq2_resultsTables/results_PSS-dWT-0h-1h_annotated.tsv")

# PSS result dIF comparing 0h and 1h
PSS_result_dIF_0h_1h_annot <- rownames_to_column(as.data.frame(PSS_result_dIF_0h_1h[order(PSS_result_dIF_0h_1h$padj),]))
PSS_result_dIF_0h_1h_annot$CDS <- df_PSS_nonRed_Ranges[PSS_result_dIF_0h_1h_annot$rowname,]$featuresOverlap
PSS_result_dIF_0h_1h_annot$TUs <- df_PSS_nonRed_Ranges[PSS_result_dIF_0h_1h_annot$rowname,]$TU_overlap
PSS_result_dIF_0h_1h_annot <- PSS_result_dIF_0h_1h_annot[,c(8,9,1,2:7)]
write_tsv(PSS_result_dIF_0h_1h_annot, "output/DESeq2_resultsTables/results_PSS-dIF-0h-1h_annotated.tsv")

Run in respective Directory (output/DESeq2_resultsTables/) python changeAnnotation_DESeq2.py results_PSS-1h_annotated.tsv results_PSS-1h_annotated-2.tsv python changeAnnotation_DESeq2.py results_PSS-0h_annotated.tsv results_PSS-0h_annotated-2.tsv python changeAnnotation_DESeq2.py results_PSS-dWT-0h-1h_annotated.tsv results_PSS-dWT-0h-1h_annotated-2.tsv python changeAnnotation_DESeq2.py results_PSS-dIF-0h-1h_annotated.tsv results_PSS-dIF-0h-1h_annotated-2.tsv

pdf(file="output/DESeq2_Plots/ddsMat_PSS-0h_pValuePlot.pdf", width=4.5, height=4.5)
pvaluePlot(PSS_result_dWT_dIF_0h, "PSS 0h")
dev.off()
## png 
##   2
pdf(file="output/DESeq2_Plots/ddsMat_PSS-1h_pValuePlot.pdf", width=4.5, height=4.5)
pvaluePlot(PSS_result_dWT_dIF_1h, "PSS 1h")
dev.off()
## png 
##   2
pdf(file="output/DESeq2_Plots/ddsMat_TSS-0h_pValuePlot.pdf", width=4.5, height=4.5)
pvaluePlot(TSS_result_dWT_dIF_0h, "TSS 0h")
dev.off()
## png 
##   2
pdf(file="output/DESeq2_Plots/ddsMat_TSS-1h_pValuePlot.pdf", width=4.5, height=4.5)
pvaluePlot(TSS_result_dWT_dIF_1h, "TSS 1h")
dev.off()
## png 
##   2
pvaluePlot(PSS_result_dWT_dIF_0h, "PSS 0h")

pvaluePlot(PSS_result_dWT_dIF_1h, "PSS 1h")

pvaluePlot(PSS_result_dWT_0h_1h, "PSS dWT 0h-1h")

pvaluePlot(PSS_result_dIF_0h_1h, "PSS dIF 0h-1h")

pvaluePlot(TSS_result_dWT_dIF_0h, "TSS 0h")

pvaluePlot(TSS_result_dWT_dIF_1h, "TSS 1h")

pvaluePlot(TSS_result_dWT_0h_1h, "TSS dWT 0h-1h")

Filter out PSS below filter level of results(). For these, p.adj is set to NA - hence: remove positions at which p.adj==NA

PSS_result_dWT_dIF_0h <- subset(PSS_result_dWT_dIF_0h, !is.na(PSS_result_dWT_dIF_0h$padj))
PSS_result_dWT_dIF_1h <- subset(PSS_result_dWT_dIF_1h, !is.na(PSS_result_dWT_dIF_1h$padj))
PSS_result_dWT_0h_1h <- subset(PSS_result_dWT_0h_1h, !is.na(PSS_result_dWT_0h_1h$padj))
PSS_result_dIF_0h_1h <- subset(PSS_result_dIF_0h_1h, !is.na(PSS_result_dIF_0h_1h$padj))
TSS_result_dWT_dIF_0h <- subset(TSS_result_dWT_dIF_0h, !is.na(TSS_result_dWT_dIF_0h$padj))
TSS_result_dWT_dIF_1h <- subset(TSS_result_dWT_dIF_1h, !is.na(TSS_result_dWT_dIF_1h$padj))
TSS_result_dWT_0h_1h <- subset(TSS_result_dWT_0h_1h, !is.na(TSS_result_dWT_0h_1h$padj))

3.2.1 PSS 0h

count_up_down(PSS_result_dWT_dIF_0h, foldchange=1, padjusted=0.05)
## [1] "number of features down:  35"
## [1] "number of features up:  133"
p <- volcanoPlot_ggplot(as.data.frame(PSS_result_dWT_dIF_0h), foldchange=1, padjusted=0.05)
p

ggsave("output/DESeq2_Plots/ddsMat_PSS-0h_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")

p <- MAplot_ggplot(PSS_result_dWT_dIF_0h, foldchange=1.0, y_axis_label = "Log2 fold-change(rne(WT)/rne(Ts))") + ylim(-5,+5)
p
## Warning: Removed 6 rows containing missing values (geom_point).

ggsave("output/DESeq2_Plots/ddsMat_PSS-0h_MAplot.pdf",plot=p, width=15, height=12, units="cm")
## Warning: Removed 6 rows containing missing values (geom_point).

3.2.2 PSS 1h

count_up_down(PSS_result_dWT_dIF_1h, foldchange=1, padjusted=0.05)
## [1] "number of features down:  725"
## [1] "number of features up:  747"
p <- volcanoPlot_ggplot(as.data.frame(PSS_result_dWT_dIF_1h), foldchange=1, padjusted=0.05)
p

ggsave("output/DESeq2_Plots/ddsMat_PSS-1h_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")

p <- MAplot_ggplot(PSS_result_dWT_dIF_1h, foldchange=1.0, y_axis_label = "Log2 fold-change(rne(WT)/rne(Ts))") + ylim(-5,+5)
p
## Warning: Removed 21 rows containing missing values (geom_point).

ggsave("output/DESeq2_Plots/ddsMat_PSS-1h_MAplot.pdf",plot=p, width=15, height=12, units="cm")
## Warning: Removed 21 rows containing missing values (geom_point).

3.2.3 PSS heat treatment

In rne(WT):

count_up_down(PSS_result_dWT_0h_1h, foldchange = 1, padjusted = 0.05)
## [1] "number of features down:  277"
## [1] "number of features up:  292"
p <- MAplot_ggplot(PSS_result_dWT_0h_1h, foldchange=1.0, y_axis_label = "Log2 fold-change(rne(WT) 1h/0h)") + ylim(-5,+5) + scale_colour_manual(values=c("DOWN"="black", "UP"="black", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p
## Warning: Removed 30 rows containing missing values (geom_point).

ggsave("output/DESeq2_Plots/ddsMat_PSS-rneWT-0h-1h_MAplot.pdf",plot=p, width=15, height=12, units="cm")
## Warning: Removed 30 rows containing missing values (geom_point).

In rne(Ts):

count_up_down(PSS_result_dIF_0h_1h, foldchange = 1, padjusted = 0.05)
## [1] "number of features down:  379"
## [1] "number of features up:  571"
p <- MAplot_ggplot(PSS_result_dIF_0h_1h, foldchange=1.0, y_axis_label = "Log2 fold-change(rne(Ts) 1h/0h)") + ylim(-5,+5) + scale_colour_manual(values=c("DOWN"="black", "UP"="black", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p
## Warning: Removed 64 rows containing missing values (geom_point).

ggsave("output/DESeq2_Plots/ddsMat_PSS-rneTs-0h-1h_MAplot.pdf",plot=p, width=15, height=12, units="cm")
## Warning: Removed 64 rows containing missing values (geom_point).

3.2.4 PSS log2FC-log2FC comparisions

Create plot comparing log2FC before and after the heat shock in both rne(WT) and rne(Ts):

PSS_result_dIF_0h_1h_annot_copy <- PSS_result_dIF_0h_1h_annot
row.names(PSS_result_dIF_0h_1h_annot_copy) <- PSS_result_dIF_0h_1h_annot_copy$rowname
positions <- intersect(row.names(PSS_result_dIF_0h_1h), row.names(PSS_result_dWT_0h_1h))
df_log2FC_log2FC_plot <- data.frame(RNAfeat=PSS_result_dIF_0h_1h_annot_copy[positions,]$CDS, PSS=positions, log2FC_rneTs=PSS_result_dIF_0h_1h[positions,]$log2FoldChange, padj_rneTs=PSS_result_dIF_0h_1h[positions,]$padj, log2FC_rneWT=PSS_result_dWT_0h_1h[positions,]$log2FoldChange, padj_rneWT=PSS_result_dWT_0h_1h[positions,]$padj, log2FC_1h=PSS_result_dWT_dIF_1h$log2FoldChange, padj_1h=PSS_result_dWT_dIF_1h$padj, diff_WT_Ts=(PSS_result_dWT_0h_1h[positions,]$log2FoldChange-PSS_result_dIF_0h_1h[positions,]$log2FoldChange), ratio_PSS_transcript=apply(PSS_norm[positions,],1,mean)/apply(trans_norm[positions,],1,mean), baseMean=PSS_result_dWT_0h_1h[positions,]$baseMean)

write_tsv(df_log2FC_log2FC_plot, "output/DESeq2_resultsTables/combined_PSS_ddHeat-WT-Ts.tsv")

# color only positions which are differentially expressed after 1 hour heat shock:
df_log2FC_log2FC_plot$diffexpressed <- "NO"
fc=1.0
df_log2FC_log2FC_plot$diffexpressed[df_log2FC_log2FC_plot$log2FC_1h<(-fc) & df_log2FC_log2FC_plot$padj_1h<0.05] <- "DOWN"
df_log2FC_log2FC_plot$diffexpressed[df_log2FC_log2FC_plot$log2FC_1h>(fc) & df_log2FC_log2FC_plot$padj_1h<0.05] <- "UP"

# only label if padj <0.05 
df_log2FC_log2FC_plot[df_log2FC_log2FC_plot$diffexpressed=="NO","overlap"] <- NA

Run in respective Directory (output/DESeq2_resultsTables/) python changeAnnotation_DESeq2.py combined_PSS_ddHeat-WT-Ts.tsv combined_PSS_ddHeat-WT-Ts_annotated.tsv

xylim=6
p <- ggplot(df_log2FC_log2FC_plot, aes(y=log2FC_rneTs, x=log2FC_rneWT, col=diffexpressed, label=RNAfeat)) + geom_point(alpha=0.3, show.legend=FALSE) + theme_light() + geom_vline(xintercept=c(-fc,+fc), linetype="dotted", color="#686868") + geom_hline(yintercept=c(-fc,+fc), linetype="dotted", color="#686868") + xlim(-xylim,+xylim) + ylim(-xylim,+xylim) + xlab("Log2FC(1h/0h) rne(WT)") + ylab("Log2FC(1h/0h) rne(Ts)") + theme(legend.position="none") + scale_colour_manual(values=c("NO"="#d3d3d3b2", "DOWN"="#e69f00b2", "UP"="#005a96b2"))
p <- p + geom_text_repel(fontface="italic")
p
## Warning: Removed 26 rows containing missing values (geom_point).
## Warning: Removed 26 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 2771 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

ggsave("output/DESeq2_Plots/ddHeat_WT-Ts_plot.pdf",plot=p, width=15, height=12, units="cm")
## Warning: Removed 26 rows containing missing values (geom_point).
## Warning: Removed 26 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 2900 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ggsave("output/DESeq2_Plots/ddHeat_WT-Ts_plot_moreSpace.pdf",plot=p, width=45, height=36, units="cm")
## Warning: Removed 26 rows containing missing values (geom_point).
## Warning: Removed 26 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 2583 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

3.2.5 TSS 0h

count_up_down(TSS_result_dWT_dIF_0h, foldchange=1, padjusted=0.05)
## [1] "number of features down:  14"
## [1] "number of features up:  49"
p <- volcanoPlot_ggplot(as.data.frame(TSS_result_dWT_dIF_0h), foldchange=1, padjusted=0.05)
p

ggsave("output/DESeq2_Plots/ddsMat_TSS-0h_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")

p <- MAplot_ggplot(TSS_result_dWT_dIF_0h, foldchange=1.0, y_axis_label = "Log2 fold-change(rne(WT)/rne(Ts))") + ylim(-7,+6)
p

ggsave("output/DESeq2_Plots/ddsMat_TSS-0h_MAplot.pdf",plot=p, width=15, height=12, units="cm")

3.2.6 TSS 1h

count_up_down(TSS_result_dWT_dIF_1h, foldchange=1, padjusted=0.05)
## [1] "number of features down:  337"
## [1] "number of features up:  228"
p <- volcanoPlot_ggplot(as.data.frame(TSS_result_dWT_dIF_1h), foldchange=1, padjusted=0.05)
p

ggsave("output/DESeq2_Plots/ddsMat_TSS-1h_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")

p <- MAplot_ggplot(TSS_result_dWT_dIF_1h, foldchange=1.0, y_axis_label = "Log2 fold-change(rne(WT)/rne(Ts))") + ylim(-7,+6)
p

ggsave("output/DESeq2_Plots/ddsMat_TSS-1h_MAplot.pdf",plot=p, width=15, height=12, units="cm")

3.2.7 TSS heat treatment

count_up_down(TSS_result_dWT_0h_1h, foldchange = 1, padjusted = 0.05)
## [1] "number of features down:  336"
## [1] "number of features up:  301"
count_up_down(TSS_result_dWT_0h_1h, foldchange = 2, padjusted = 0.05)
## [1] "number of features down:  67"
## [1] "number of features up:  24"
p <- MAplot_ggplot(TSS_result_dWT_0h_1h, foldchange=1.0, y_axis_label = "Log2 fold-change(rne(WT) 1h/0h)") + ylim(-7,+6) + scale_colour_manual(values=c("DOWN"="black", "UP"="black", "NO"="#d3d3d3ff"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p
## Warning: Removed 2 rows containing missing values (geom_point).

ggsave("output/DESeq2_Plots/ddsMat_TSS-rneWT-0h-1h_MAplot.pdf",plot=p, width=15, height=12, units="cm")
## Warning: Removed 2 rows containing missing values (geom_point).

3.3 Test if 5’-P / 5’-PPP ratio changes at TSS positions

TSSPSS_at_TSSpositions <- counts(ddsMat_PSSTSS, normalize=FALSE)[names_prob_TSS,] # normalize= TRUE/FALSE: should not make difference, but decided to use FALSE since should introduce less bias

# calculate ratios
dIF11_0h_ratio=TSSPSS_at_TSSpositions[,"dIF11_0h_PSS"]/TSSPSS_at_TSSpositions[,"dIF11_0h_TSS"]
dIF11_1h_ratio=TSSPSS_at_TSSpositions[,"dIF11_1h_PSS"]/TSSPSS_at_TSSpositions[,"dIF11_1h_TSS"]
dIF12_0h_ratio=TSSPSS_at_TSSpositions[,"dIF12_0h_PSS"]/TSSPSS_at_TSSpositions[,"dIF12_0h_TSS"]
dIF12_1h_ratio=TSSPSS_at_TSSpositions[,"dIF12_1h_PSS"]/TSSPSS_at_TSSpositions[,"dIF12_1h_TSS"]
dIF31_0h_ratio=TSSPSS_at_TSSpositions[,"dIF31_0h_PSS"]/TSSPSS_at_TSSpositions[,"dIF31_0h_TSS"]
dIF31_1h_ratio=TSSPSS_at_TSSpositions[,"dIF31_1h_PSS"]/TSSPSS_at_TSSpositions[,"dIF31_1h_TSS"]

dWT1_0h_ratio=TSSPSS_at_TSSpositions[,"dWT1_0h_PSS"]/TSSPSS_at_TSSpositions[,"dWT1_0h_TSS"]
dWT1_1h_ratio=TSSPSS_at_TSSpositions[,"dWT1_1h_PSS"]/TSSPSS_at_TSSpositions[,"dWT1_1h_TSS"]
dWT2_0h_ratio=TSSPSS_at_TSSpositions[,"dWT2_0h_PSS"]/TSSPSS_at_TSSpositions[,"dWT2_0h_TSS"]
dWT2_1h_ratio=TSSPSS_at_TSSpositions[,"dWT2_1h_PSS"]/TSSPSS_at_TSSpositions[,"dWT2_1h_TSS"]
dWT3_0h_ratio=TSSPSS_at_TSSpositions[,"dWT3_0h_PSS"]/TSSPSS_at_TSSpositions[,"dWT3_0h_TSS"]
dWT3_1h_ratio=TSSPSS_at_TSSpositions[,"dWT3_1h_PSS"]/TSSPSS_at_TSSpositions[,"dWT3_1h_TSS"]
# create df for plotting
df_TSSPSS_ratio <- as.data.frame(cbind(sample=c(rep("dIF1_0h", length(names_prob_TSS)),rep("dIF1_1h", length(names_prob_TSS)),rep("dIF2_0h", length(names_prob_TSS)),rep("dIF2_1h", length(names_prob_TSS)),rep("dIF3_0h", length(names_prob_TSS)),rep("dIF3_1h", length(names_prob_TSS)),rep("dWT1_0h", length(names_prob_TSS)),rep("dWT1_1h", length(names_prob_TSS)),rep("dWT2_0h", length(names_prob_TSS)),rep("dWT2_1h", length(names_prob_TSS)),rep("dWT3_0h", length(names_prob_TSS)),rep("dWT3_1h", length(names_prob_TSS))), type=c(rep("rne(Ts)", 6*length(names_prob_TSS)), rep("rne(WT)", 6*length(names_prob_TSS)))))
df_TSSPSS_ratio$TSSPSS_ratio <- c(dIF11_0h_ratio, dIF11_1h_ratio, dIF12_0h_ratio, dIF12_1h_ratio, dIF31_0h_ratio, dIF31_1h_ratio, dWT1_0h_ratio, dWT1_1h_ratio, dWT2_0h_ratio, dWT2_1h_ratio, dWT3_0h_ratio, dWT3_1h_ratio)
p <- ggplot(data=df_TSSPSS_ratio, aes(x=sample, y=TSSPSS_ratio, fill=type)) + geom_violin() +
  geom_boxplot(width=0.1, color="grey", alpha=0.2, outlier.shape=NA, show.legend = FALSE) +
  scale_fill_viridis(discrete=TRUE, alpha=0.6, option="A") +
  theme_light() +
  labs(fill="", y="PSS/TSS at TSS positions", x="")
p
## Warning: Removed 74 rows containing non-finite values (stat_ydensity).
## Warning: Removed 74 rows containing non-finite values (stat_boxplot).

df_TSSPSS_ratio_2 <- subset(df_TSSPSS_ratio, !is.na(df_TSSPSS_ratio$TSSPSS_ratio) & !is.infinite(df_TSSPSS_ratio$TSSPSS_ratio))
df_TSSPSS_ratio_2$TSSPSS_ratio <- df_TSSPSS_ratio_2$TSSPSS_ratio + 0.0001
p <- ggplot(data=df_TSSPSS_ratio_2, aes(x=sample, y=TSSPSS_ratio, fill=type)) + geom_violin() +
  geom_boxplot(width=0.1, color="grey", alpha=0.2, outlier.shape=NA, show.legend = FALSE) +
  scale_fill_viridis(discrete=TRUE, alpha=0.6, option="A") +
  theme_light() +
  labs(fill="", y="PSS/TSS at TSS positions", x="") + scale_y_continuous(trans="log10", limits=c(NA,10), breaks=c(0,0.01,0.1,1,10))
p

4 Exploratory data analysis

4.1 Extract Peak Positions and create GRanges objects

advanced_reduce_withBaseMeans() merges adjacent PSS positions. In a second step, peaks which are wider than 1nt are reduced to a 1nt wide position according to the baseMeans of the peaks - the most highly populated position is given. PSS_Ranges_up are peaks which are higher in rne(WT), PSS_Ranges_down peaks which are higher in rne(Ts).

4.1.1 For all positions

PSS_Ranges_up <- advanced_reduce_withBaseMeans(create_GRanges_object_from_resObject(PSS_result_dWT_dIF_1h, foldchange=1, padjusted=0.05))
PSS_Ranges_down <- advanced_reduce_withBaseMeans(create_GRanges_object_from_resObject(PSS_result_dWT_dIF_1h, foldchange=(-1), padjusted=0.05, up=FALSE))
PSS_Ranges <- c(PSS_Ranges_up, PSS_Ranges_down)

4.3 Extract data for MEME, MEME-ChIP

Use commands in MEME_analyses_RNaseE_peaks.txt to do MEME analyses.

peaks_all <- extract_RNA_sequences(PSS_Ranges, syne_fasta, 25)
peaks_up <- extract_RNA_sequences(PSS_Ranges_up, syne_fasta, 25)
peaks_down <- extract_RNA_sequences(PSS_Ranges_down, syne_fasta, 25)

writeXStringSet(peaks_all, "output/rneWT_rneTs_Weblogo/all_peaks_25upDownstream.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(peaks_up, "output/rneWT_rneTs_Weblogo/peaks_up_25upDownstream.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(peaks_down, "output/rneWT_rneTs_Weblogo/peaks_down_25upDownstream.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")

4.4 Analyse overlap with chromosome/plasmids

Prepare data frames for the plots and count how many PSS are overlapping with which sequence and with how many of the respective features:

# prepare data.frame for barplot
sequences <- as.character(levels(seqnames(PSS_Ranges_up)))
updown <- c(rep("rne(WT)",length(sequences)), rep("rne(Ts)",length(sequences)))
df_PSS_sequence_barplot <- data.frame(cbind(sequence=sequences, updown=updown))
df_PSS_sequence_barplot$updown <- factor(df_PSS_sequence_barplot$updown, levels=c("rne(WT)", "rne(Ts)"))
df_PSS_sequence_barplot$x_labels <- NA
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence=="BA000022.2",]$x_labels <- "Chr"
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence=="AP004311.1",]$x_labels <- "pSYSA"
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence=="AP004312.1",]$x_labels <- "pSYSG"
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence=="AP004310.1",]$x_labels <- "pSYSM"
df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence=="AP006585.1",]$x_labels <- "pSYSX"
df_PSS_sequence_barplot$x_labels <- factor(df_PSS_sequence_barplot$x_labels, levels=c("Chr", "pSYSA", "pSYSG", "pSYSM", "pSYSX"))
df_PSS_sequence_barplot$geno_size <- NA
for(i in df_PSS_sequence_barplot$sequence){
  df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence==i,]$geno_size <- seqlengths(syne_fasta)[i]
}
df_PSS_sequence_barplot$geno_relative_size <- df_PSS_sequence_barplot$geno_size/sum(seqlengths(syne_fasta))*100
df_PSS_sequence_barplot$number_feat_overlap <- NA
df_PSS_sequence_barplot$number_PSS_overlap <- NA 
df_PSS_sequence_barplot$number_feat_total <- NA
df_PSS_sequence_barplot$PSS_kb <- NA

# prepare data frame for number of overlaps with certain features
df_PSS_sequence_overlap_perKb <- data.frame()

# extract number of overlaps with different sequences and features, both for up- and downregulated PSS positions
for(s in sequences){
  index_up <- which(df_PSS_sequence_barplot$sequence==s & df_PSS_sequence_barplot$updown=="rne(WT)")
  index_down <- which(df_PSS_sequence_barplot$sequence==s & df_PSS_sequence_barplot$updown=="rne(Ts)") 
  
  df_PSS_sequence_barplot[index_up,"number_feat_overlap"] <- sum(countOverlaps(subset(features, seqnames(features)==s), PSS_Ranges_up)>0) # count number of features affected
  df_PSS_sequence_barplot[index_up,"number_PSS_overlap"] <- length(subset(PSS_Ranges_up, seqnames(PSS_Ranges_up)==s))
  df_PSS_sequence_barplot[index_up,"PSS_kb"] <- length(subset(PSS_Ranges_up, seqnames(PSS_Ranges_up)==s))/(seqlengths(syne_fasta[s])/1000)
    
  df_PSS_sequence_barplot[index_down,"number_feat_overlap"] <- sum(countOverlaps(subset(features, seqnames(features)==s), PSS_Ranges_down)>0) # count number of features affected
  df_PSS_sequence_barplot[index_down,"number_PSS_overlap"] <- length(subset(PSS_Ranges_down, seqnames(PSS_Ranges_down)==s))
  df_PSS_sequence_barplot[index_down,"PSS_kb"] <- length(subset(PSS_Ranges_down, seqnames(PSS_Ranges_down)==s))/(seqlengths(syne_fasta[s])/1000)
  
  df_PSS_sequence_barplot[df_PSS_sequence_barplot$sequence==s,"number_feat_total"] <- length(subset(features, seqnames(features)==s))
}

df_PSS_sequence_barplot$perc_feat <- (df_PSS_sequence_barplot$number_feat_overlap/df_PSS_sequence_barplot$number_feat_total)*100
df_PSS_sequence_barplot$perc_all_feat <- NA
df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(WT)",]$perc_all_feat <- df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(WT)",]$number_PSS_overlap/sum(df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(WT)",]$number_PSS_overlap)*100
df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(Ts)",]$perc_all_feat <- df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(Ts)",]$number_PSS_overlap/sum(df_PSS_sequence_barplot[df_PSS_sequence_barplot$updown=="rne(Ts)",]$number_PSS_overlap)*100
df_PSS_sequence_barplot
##      sequence  updown x_labels geno_size geno_relative_size number_feat_overlap
## 1  AP004310.1 rne(WT)    pSYSM    119895           3.037609                   3
## 2  AP004311.1 rne(WT)    pSYSA    103307           2.617342                  13
## 3  AP004312.1 rne(WT)    pSYSG     44343           1.123455                   3
## 4  AP006585.1 rne(WT)    pSYSX    106004           2.685672                   5
## 5  BA000022.2 rne(WT)      Chr   3573470          90.535921                 212
## 6  AP004310.1 rne(Ts)    pSYSM    119895           3.037609                   3
## 7  AP004311.1 rne(Ts)    pSYSA    103307           2.617342                   3
## 8  AP004312.1 rne(Ts)    pSYSG     44343           1.123455                   1
## 9  AP006585.1 rne(Ts)    pSYSX    106004           2.685672                   4
## 10 BA000022.2 rne(Ts)      Chr   3573470          90.535921                 215
##    number_PSS_overlap number_feat_total     PSS_kb perc_feat perc_all_feat
## 1                  11               134 0.09174695  2.238806     1.7377567
## 2                 118               111 1.14222657 11.711712    18.6413902
## 3                   2                51 0.04510295  5.882353     0.3159558
## 4                  14               112 0.13207049  4.464286     2.2116904
## 5                 488              5726 0.13656194  3.702410    77.0932070
## 6                   2               134 0.01668126  2.238806     0.3231018
## 7                   4               111 0.03871954  2.702703     0.6462036
## 8                   1                51 0.02255147  1.960784     0.1615509
## 9                   6               112 0.05660164  3.571429     0.9693053
## 10                606              5726 0.16958307  3.754803    97.8998384
write.csv(df_PSS_sequence_barplot, file="output/overlap_contigs_PSS.csv")
# Plot: PSS per kb
p <- ggplot(data=df_PSS_sequence_barplot, aes(x=x_labels,y=PSS_kb, fill=updown)) + geom_bar(stat="identity", position="dodge") + theme_light()+ xlab("") + ylab("PSS/kb") + scale_fill_manual(name="", values=c("#005a96ff", "#e69f00ff"), breaks=c("rne(WT)", "rne(Ts)"), labels=c("rne(WT)", "rne(Ts)"))
p

ggsave(filename = "output/barplots_overlapFeatures/barplot_PSSkb_chr_plasmids.pdf", plot = p, width = 14, height = 7, units = "cm")

# Plot: How many features of a certain kind are overlapped by PSS?
p <- ggplot(data=df_PSS_sequence_barplot, aes(x=x_labels,y=perc_feat, fill=updown)) + geom_bar(stat="identity", position="dodge") + theme_light()+ xlab("") + ylab("Features overlapped by PSS (%)") + scale_fill_manual(name="", values=c("#005a96ff", "#e69f00ff"), breaks=c("rne(WT)", "rne(Ts)"), labels=c("rne(WT)", "rne(Ts)"))
p

ggsave(filename = "output/barplots_overlapFeatures/barplot_percentageFeatures_chr_plasmids.pdf", plot = p, width = 14, height = 7, units = "cm")

# Plots: How many PSS are localized in which kind of feature?
p <- ggplot(data=df_PSS_sequence_barplot, aes(x=updown,y=number_PSS_overlap, fill=x_labels)) + geom_bar(stat="identity", position="stack") + theme_light()+ xlab("") + ylab("Percentage of all PSS (%)") + scale_fill_brewer(palette="Paired", direction=1)#scale_fill_OkabeIto(name="", order=c(2:8,1))
p

ggsave(filename = "output/barplots_overlapFeatures/barplot_absolute_chromo_plasmids_ofAllPSS.pdf", plot = p, width = 7, height = 7, units = "cm")

p <- ggplot(data=df_PSS_sequence_barplot, aes(x=updown,y=number_PSS_overlap, fill=x_labels)) + geom_bar(stat="identity", position="fill") + theme_light()+ xlab("") + ylab("Number PSS") + scale_fill_brewer(palette="Paired", direction=1)#scale_fill_OkabeIto(name="", order=c(2:8,1))
p

ggsave(filename = "output/barplots_overlapFeatures/barplot_perc_chromo_plasmids_ofAllPSS.pdf", plot = p, width = 7, height = 7, units = "cm")

For comparison, plot lengths of different parts of genome:

seqlengths_syne <- as.data.frame(seqlengths(syne_fasta))
seqlengths_syne$name <- NA
seqlengths_syne["BA000022.2",]$name <- "Chr"
seqlengths_syne["AP004311.1",]$name <- "pSYSA"
seqlengths_syne["AP004312.1",]$name <- "pSYSG"
seqlengths_syne["AP004310.1",]$name <- "pSYSM"
seqlengths_syne["AP006585.1",]$name <- "pSYSX"
seqlengths_syne$x <- "a"

p <- ggplot(data=seqlengths_syne, aes(x=x, y=seqlengths(syne_fasta), fill=name)) + geom_bar(stat="identity", position="fill") + theme_light()+ xlab("") + ylab("Percentage of total length") + scale_fill_brewer(palette="Paired", direction=1)#scale_fill_OkabeIto(name="", order=c(2:8,1))
p

ggsave(filename = "output/barplots_overlapFeatures/barplot_perc_chromo_plasmids_lengths.pdf", plot = p, width = 6, height = 7, units = "cm")

4.5 Analyse overlap with different RNA features and number of overlaps with those features

Prepare data frames for the plots and count how many PSS are overlapping with which features, how many features of a certain type are overlapped by PSS, how many PSS are located in each feature per kb.

# prepare data.frame for barplot
types <- rep(c("CDS","5UTR", "3UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc"),3)
updown <- c(rep("up",9), rep("down",9), rep("both",9))
df_PSS_features <- data.frame(cbind(type=types, updown=updown))
df_PSS_features$number_feat_overlap <- NA 
df_PSS_features$number_PSS_overlap <- NA
df_PSS_features$number_total <- NA

# extract number of overlaps with different feature types, both for up- and downregulated PSS positions
types <- c("CDS","5UTR", "3UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc")
for(t in types){
  t_feat <- which(features$type == t)
  
  index_up <- which(df_PSS_features$type==t & df_PSS_features$updown=="up")
  index_down <- which(df_PSS_features$type==t & df_PSS_features$updown=="down") 
  index_both <- which(df_PSS_features$type==t & df_PSS_features$updown=="both") 
  
  df_PSS_features[index_up,"number_feat_overlap"] <- sum(countOverlaps(features[t_feat], PSS_Ranges_up)>0) # count number of features affected
  df_PSS_features[index_up,"number_PSS_overlap"] <- sum(countOverlaps(PSS_Ranges_up, features[t_feat])>0) # count where PSS are located
  df_PSS_features[index_down,"number_feat_overlap"] <- sum(countOverlaps(features[t_feat], PSS_Ranges_down)>0) # count number of features affected
  df_PSS_features[index_down,"number_PSS_overlap"] <- sum(countOverlaps(PSS_Ranges_down, features[t_feat])>0) # count where PSS are located
  df_PSS_features[index_both,"number_feat_overlap"] <- sum(countOverlaps(features[t_feat], PSS_Ranges_down)>0 | countOverlaps(features[t_feat], PSS_Ranges_up)>0) # count number of features affected
  df_PSS_features[index_both,"number_PSS_overlap"] <- sum(countOverlaps(PSS_Ranges_down, features[t_feat])>0) + sum(countOverlaps(PSS_Ranges_up, features[t_feat])>0) # count where PSS are located
  
  df_PSS_features[df_PSS_features$type==t,"number_total"] <- length(features[t_feat])
  
}
df_PSS_features
##      type updown number_feat_overlap number_PSS_overlap number_total
## 1     CDS     up                 198                436         3675
## 2    5UTR     up                  19                 28          979
## 3    3UTR     up                   4                  5           29
## 4    tRNA     up                   0                  0           42
## 5    rRNA     up                   0                  0            6
## 6   ncRNA     up                   6                 14          318
## 7   asRNA     up                   6                  7         1071
## 8  CRISPR     up                   2                 39            3
## 9    misc     up                   1                  1           11
## 10    CDS   down                 182                491         3675
## 11   5UTR   down                  22                 31          979
## 12   3UTR   down                   0                  0           29
## 13   tRNA   down                   1                  1           42
## 14   rRNA   down                   0                  0            6
## 15  ncRNA   down                  10                 10          318
## 16  asRNA   down                  11                 11         1071
## 17 CRISPR   down                   0                  0            3
## 18   misc   down                   0                  0           11
## 19    CDS   both                 307                927         3675
## 20   5UTR   both                  36                 59          979
## 21   3UTR   both                   4                  5           29
## 22   tRNA   both                   1                  1           42
## 23   rRNA   both                   0                  0            6
## 24  ncRNA   both                  15                 24          318
## 25  asRNA   both                  17                 18         1071
## 26 CRISPR   both                   2                 39            3
## 27   misc   both                   1                  1           11
write.csv(df_PSS_features, file="output/overlap_RNAfeatures_PSS.csv")

for TUs:

sum(countOverlaps(TUs, PSS_Ranges_up)>0)
## [1] 248
sum(countOverlaps(TUs, PSS_Ranges_down)>0)
## [1] 237
sum((countOverlaps(TUs, PSS_Ranges_up)|countOverlaps(TUs, PSS_Ranges_down))>0)
## [1] 380

4.5.1 Create bar plots

Do further processing of data frames.

df_PSS_features_barplot <- subset(df_PSS_features, df_PSS_features$updown=="up" | df_PSS_features$updown=="down")

total_overlaps_up <- sum(countOverlaps(PSS_Ranges_up, features)>0)
total_overlaps_down <- sum(countOverlaps(PSS_Ranges_down, features)>0)

df_PSS_features_barplot[,"number_feat_overlap"] <- as.integer(df_PSS_features_barplot[,"number_feat_overlap"])/as.integer(df_PSS_features_barplot[,"number_total"])*100

# create column  with absolute numbers
df_PSS_features_barplot$number_PSS_overlap_absolute <- NA
df_PSS_features_barplot[df_PSS_features_barplot$updown=="up",]$number_PSS_overlap_absolute <- as.integer(df_PSS_features_barplot[df_PSS_features_barplot$updown=="up","number_PSS_overlap"])
df_PSS_features_barplot[df_PSS_features_barplot$updown=="down",]$number_PSS_overlap_absolute <- as.integer(df_PSS_features_barplot[df_PSS_features_barplot$updown=="down","number_PSS_overlap"])

# calculate percentages
df_PSS_features_barplot[df_PSS_features_barplot$updown=="up","number_PSS_overlap"] <- as.integer(df_PSS_features_barplot[df_PSS_features_barplot$updown=="up","number_PSS_overlap"])/total_overlaps_up *100
df_PSS_features_barplot[df_PSS_features_barplot$updown=="down","number_PSS_overlap"] <- as.integer(df_PSS_features_barplot[df_PSS_features_barplot$updown=="down","number_PSS_overlap"])/total_overlaps_down *100

# rename types (5'UTR instead of 5UTR, 3'UTR instead of 3UTR)
df_PSS_features_barplot[,"type"] <- rep(c("CDS","5'UTR", "3'UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc"),2)

# reduce plot to information I actually care about
df_PSS_features_barplot <- df_PSS_features_barplot[which(!df_PSS_features_barplot$type=="rRNA"),] # rRNAs cannot be counted here, because multireads were excluded from analysis
df_PSS_features_barplot <- df_PSS_features_barplot[which(!df_PSS_features_barplot$type=="misc"),] # misc are actually only genes which are split by plasmid / chromosome beginning and two features (NC-2306445, NC-3547510) where I never figured out what they are. One PSS overlaps with sll8049-1. So I think one can savely ignore those

Do actual plotting.

# cast type, updown as factors to influence order in which they show up in the plots
df_PSS_features_barplot$type <- factor(df_PSS_features_barplot$type, levels=c("tRNA", "asRNA", "ncRNA", "CRISPR", "3'UTR", "5'UTR", "CDS"))
df_PSS_features_barplot$updown <- c(rep("rne(WT)", 7), rep("rne(Ts)",7))
df_PSS_features_barplot$updown <- factor(df_PSS_features_barplot$updown, levels=c("rne(WT)", "rne(Ts)"))

# Plot: How many features of a certain kind are overlapped by PSS?
p <- ggplot(data=df_PSS_features_barplot, aes(x=type,y=number_feat_overlap, fill=updown)) + geom_bar(stat="identity", position="dodge") + theme_light()+ xlab("") + ylab("Percentage PSS (%)") + scale_fill_manual(name="", values=c("#005a96ff", "#e69f00ff"), breaks=c("rne(WT)", "rne(Ts)"), labels=c("rne(WT)", "rne(Ts)"))
p

ggsave(filename = "output/barplots_overlapFeatures/barplot_percentage_of_allThisFeature.pdf", plot = p, width = 14, height = 7, units = "cm")

# Plots: How many PSS are localized in which kind of feature?
p <- ggplot(data=df_PSS_features_barplot, aes(x=updown,y=number_PSS_overlap, fill=type)) + geom_bar(stat="identity", position="stack") + theme_light()+ xlab("") + ylab("Percentage of all PSS (%)") + scale_fill_brewer(palette="Paired", direction=1)#scale_fill_OkabeIto(name="", order=c(2:8,1))
p

ggsave(filename = "output/barplots_overlapFeatures/barplot_percentage_ofAllPSS.pdf", plot = p, width = 7, height = 7, units = "cm")

p <- ggplot(data=df_PSS_features_barplot, aes(x=updown,y=number_PSS_overlap_absolute, fill=type)) + geom_bar(stat="identity", position="stack") + theme_light()+ xlab("") + ylab("Number PSS") + scale_fill_brewer(palette="Paired", direction=1)#scale_fill_OkabeIto(name="", order=c(2:8,1))
p

ggsave(filename = "output/barplots_overlapFeatures/barplot_absolute_ofAllPSS.pdf", plot = p, width = 7, height = 7, units = "cm")

4.6 GO and KEGG enrichment in set of detected PSS

Control if, in general, PSS are rather detected in certain set of genes. GO annotation only exists for CDS, not for other features: Therefore, only use CDS for functional enrichment.

Detectable PSS are enriched in genes associated with photosynthesis or ribosomes. I assume both sets are, compared to other sets of genes, highly transcribed and that’s the reason why PSS can be detected in those genes. Probably, there are also stable processed RNA fragments created from other sets of transcripts, but they might be too lowly transcribed to be captured by tagRNA-Seq.

CDS_features <- features[which(features$type=="CDS")]
locus_tags_all_PSS <- CDS_features[which(countOverlaps(CDS_features, PSS_nonReduced_Ranges)>0)]$locus_tag
# compared to all CDS
kegg_PSS_all_enrich <- kegg_functional_enrichment_locusList(locus_tags_all_PSS, CDS_features$locus_tag, write=TRUE, path="output/enrichment/KEGG_allPSS.csv")
## Reading KEGG annotation online:
## 
## Reading KEGG annotation online:
##                ID                                 Description GeneRatio BgRatio
## syn00195 syn00195                              Photosynthesis    33/217  63/953
## syn00196 syn00196           Photosynthesis - antenna proteins    12/217  15/953
## syn00710 syn00710 Carbon fixation in photosynthetic organisms    11/217  18/953
## syn03010 syn03010                                    Ribosome    22/217  54/953
##                pvalue     p.adjust       qvalue
## syn00195 1.019874e-07 6.221232e-06 5.904535e-06
## syn00196 3.546420e-06 1.081658e-04 1.026595e-04
## syn00710 4.666676e-04 9.488908e-03 9.005866e-03
## syn03010 1.790114e-03 2.729924e-02 2.590955e-02
go_PSS_all_enrich <- go_functional_enrichment_locusList(locus_tags_all_PSS, CDS_features$locus_tag, write=TRUE, path="output/enrichment/GO_allPSS.csv")
##                    ID                            Description GeneRatio BgRatio
## GO:0042651 GO:0042651                     thylakoid membrane    42/438 96/2593
## GO:0015979 GO:0015979                         photosynthesis    39/438 86/2593
## GO:0018298 GO:0018298            protein-chromophore linkage    17/438 31/2593
## GO:0030089 GO:0030089                          phycobilisome    14/438 24/2593
## GO:0015986 GO:0015986 ATP synthesis coupled proton transport     8/438 10/2593
## GO:0003735 GO:0003735     structural constituent of ribosome    22/438 56/2593
##                  pvalue     p.adjust       qvalue
## GO:0042651 2.656795e-10 2.696962e-08 2.288332e-08
## GO:0015979 3.269045e-10 2.696962e-08 2.288332e-08
## GO:0018298 1.454275e-06 7.998514e-05 6.786618e-05
## GO:0030089 4.812310e-06 1.985078e-04 1.684309e-04
## GO:0015986 2.053594e-05 6.776859e-04 5.750062e-04
## GO:0003735 4.621568e-05 1.197139e-03 1.015755e-03
##                                                                                                                                                                                                                                                                                                                                                     geneID
## GO:0042651 slr1311/sll1327/sll1326/sll1324/sll1323/ssl2615/sll1322/slr1291/slr1295/smr0005/slr1604/sml0005/smr0006/sll1814/slr1834/slr1835/sll1463/sll1281/ssl2501/sml0007/sll0851/sll0849/slr0261/slr1513/slr1329/slr1330/sml0008/sll1867/slr1281/ssr2831/sll0258/ssl0563/sml0001/smr0001/slr0342/sll0199/slr0906/slr0083/sll0047/sll0520/sll0519/smr0004
## GO:0015979                         sll1214/slr0737/sll1031/sll1029/sll1194/sll1184/sll0928/smr0005/slr1055/sml0005/smr0006/slr2051/ssl3093/sll1580/sll1579/sll1578/sll1577/slr1834/slr1835/sll1281/sml0007/slr2067/slr1986/ssr3383/sml0008/sll0819/slr1347/sll1091/ssr2831/sll0427/sll0258/slr0335/sml0001/smr0001/slr0342/slr0436/sll0047/slr1459/smr0004
## GO:0018298                                                                                                                                                                                                         slr1311/sll0928/slr0687/sll1578/sll1577/slr1834/slr1835/sll0851/sll0849/slr2067/slr1986/slr1963/sll1867/slr0335/slr0906/sll0041/slr1459
## GO:0030089                                                                                                                                                                                                                                 sll0928/slr2051/ssl3093/sll1580/sll1579/sll1578/sll1577/slr2067/slr1986/ssr3383/slr1963/slr1687/slr0335/slr1459
## GO:0015986                                                                                                                                                                                                                                                                                 sll1327/sll1326/sll1324/sll1323/ssl2615/sll1322/slr1329/slr1330
## GO:0003735                                                                                                                                                                 ssr1736/sll1824/ssl3445/sll1819/sml0006/sll1813/sll1808/sll1803/sll1802/sll1800/ssl2233/sll1746/sll1744/slr1356/sll1260/sll1101/sll1097/slr1678/ssr2799/sll0767/slr0469/slr0628
##            Count
## GO:0042651    42
## GO:0015979    39
## GO:0018298    17
## GO:0030089    14
## GO:0015986     8
## GO:0003735    22
p <- dotplot(kegg_PSS_all_enrich)
## wrong orderBy parameter; set to default `orderBy = "x"`
p

ggsave("output/enrichment/KEGG_dotplot_allPSS.pdf", plot=p, width=20, height=14, units="cm")
p <- dotplot(go_PSS_all_enrich, showCategory=20)
## wrong orderBy parameter; set to default `orderBy = "x"`
p

ggsave("output/enrichment/GO_dotplot_allPSS.pdf", plot=p, width=20, height=14, units="cm")

Compared to the positions where PSS were generally detected, no terms are enriched in set of PSS upregulated (higher in rne(WT)).

# enrichment in rne(WT) PSS
locus_tags_up <- CDS_features[which(countOverlaps(CDS_features, PSS_Ranges_up)>0)]$locus_tag
# compared to all CDS
kegg_PSS_up_enrich <- kegg_functional_enrichment_locusList(locus_tags_up, CDS_features$locus_tag, write=TRUE, path="output/enrichment/KEGG_PSSup_universeAllCDS.csv")
##                ID                       Description GeneRatio BgRatio
## syn00195 syn00195                    Photosynthesis     15/78  63/953
## syn00196 syn00196 Photosynthesis - antenna proteins      6/78  15/953
##                pvalue    p.adjust      qvalue
## syn00195 6.702867e-05 0.002882233 0.002751703
## syn00196 6.854271e-04 0.014736683 0.014069293
go_PSS_up_enrich <- go_functional_enrichment_locusList(locus_tags_up, CDS_features$locus_tag, write=TRUE, path="output/enrichment/GO_PSSup_universeAllCDS.csv")
##                    ID                 Description GeneRatio BgRatio
## GO:0015979 GO:0015979              photosynthesis    20/157 86/2593
## GO:0018298 GO:0018298 protein-chromophore linkage     9/157 31/2593
## GO:0030089 GO:0030089               phycobilisome     7/157 24/2593
## GO:0006457 GO:0006457             protein folding     6/157 19/2593
## GO:0051082 GO:0051082    unfolded protein binding     5/157 15/2593
## GO:0042651 GO:0042651          thylakoid membrane    14/157 96/2593
##                  pvalue     p.adjust       qvalue
## GO:0015979 7.655460e-08 1.025832e-05 9.267135e-06
## GO:0018298 5.541107e-05 3.712541e-03 3.353828e-03
## GO:0030089 3.764334e-04 1.681403e-02 1.518942e-02
## GO:0006457 6.284358e-04 2.105260e-02 1.901845e-02
## GO:0051082 1.396511e-03 3.435370e-02 3.103437e-02
## GO:0042651 1.538226e-03 3.435370e-02 3.103437e-02
##                                                                                                                                                                     geneID
## GO:0015979 sll1214/slr0737/sll1194/sll1184/smr0005/slr1055/slr2051/sll1580/sll1578/sll1577/slr1834/slr1835/slr1986/sml0008/sll1091/ssr2831/sll0427/slr0335/smr0001/slr0342
## GO:0018298                                                                                         slr1311/sll1578/sll1577/slr1834/slr1835/sll0851/slr1986/slr1963/slr0335
## GO:0030089                                                                                                         slr2051/sll1580/sll1578/sll1577/slr1986/slr1963/slr0335
## GO:0006457                                                                                                                 sll1932/slr2076/sll1988/slr0093/sll0535/sll0533
## GO:0051082                                                                                                                         sll1932/slr2076/sll1988/slr0093/sll0535
## GO:0042651                                                 slr1311/sll1322/smr0005/slr1834/slr1835/sll1463/ssl2501/sll0851/sml0008/ssr2831/ssl0563/smr0001/slr0342/sll0199
##            Count
## GO:0015979    20
## GO:0018298     9
## GO:0030089     7
## GO:0006457     6
## GO:0051082     5
## GO:0042651    14
# compared to universe: Where are PSS generally localized / what was detectable
kegg_PSS_up_enrich_2 <- kegg_functional_enrichment_locusList(list_of_genes=locus_tags_up, universe_tags=locus_tags_all_PSS, write=TRUE, path="output/enrichment/KEGG_PSSup_universeDetectedCDS.csv")
## [1] ID          Description GeneRatio   BgRatio     pvalue      p.adjust   
## [7] qvalue     
## <0 rows> (or 0-length row.names)
go_PSS_up_enrich_2 <- go_functional_enrichment_locusList(locus_tags_up, universe_tags=locus_tags_all_PSS, write=TRUE, path="output/enrichment/GO_PSSup_universeDetectedCDS.csv")
## [1] ID          Description GeneRatio   BgRatio     pvalue      p.adjust   
## [7] qvalue      geneID      Count      
## <0 rows> (or 0-length row.names)

When using all positions of PSS as universe, syn00195 (Photosynthesis) and syn00196 (Photosynthesis - antenna proteins) are enriched in the PSS ranges down data set. The enriched GO terms match those terms, except that “nucleic adic binding” is added.

# enrichment in rne(WT) PSS
locus_tags_down <- CDS_features[which(countOverlaps(CDS_features, PSS_Ranges_down)>0)]$locus_tag
# compared to all CDS
kegg_PSS_down_enrich <- kegg_functional_enrichment_locusList(locus_tags_down, CDS_features$locus_tag, write=TRUE, path="output/enrichment/KEGG_PSSdown_universeAllCDS.csv")
##                ID                       Description GeneRatio BgRatio
## syn00195 syn00195                    Photosynthesis     24/90  63/953
## syn00196 syn00196 Photosynthesis - antenna proteins     10/90  15/953
## syn03018 syn03018                   RNA degradation      6/90  14/953
##                pvalue     p.adjust       qvalue
## syn00195 1.099909e-10 4.729608e-09 4.515416e-09
## syn00196 7.080022e-08 1.522205e-06 1.453268e-06
## syn03018 9.717015e-04 1.392772e-02 1.329697e-02
go_PSS_down_enrich <- go_functional_enrichment_locusList(locus_tags_down, CDS_features$locus_tag, write=TRUE, path="output/enrichment/GO_PSSdown_universeAllCDS.csv")
##                    ID                                      Description
## GO:0015979 GO:0015979                                   photosynthesis
## GO:0018298 GO:0018298                      protein-chromophore linkage
## GO:0042651 GO:0042651                               thylakoid membrane
## GO:0030089 GO:0030089                                    phycobilisome
## GO:0030096 GO:0030096 plasma membrane-derived thylakoid photosystem II
## GO:0016168 GO:0016168                              chlorophyll binding
##            GeneRatio BgRatio       pvalue     p.adjust       qvalue
## GO:0015979    27/153 86/2593 5.813971e-14 7.267463e-12 6.425968e-12
## GO:0018298    15/153 31/2593 2.470232e-11 1.543895e-09 1.365128e-09
## GO:0042651    25/153 96/2593 5.850269e-11 2.437612e-09 2.155362e-09
## GO:0030089    12/153 24/2593 1.705428e-09 5.329462e-08 4.712366e-08
## GO:0030096    10/153 28/2593 1.998543e-06 4.996357e-05 4.417832e-05
## GO:0016168     6/153 10/2593 6.611939e-06 1.377487e-04 1.217989e-04
##                                                                                                                                                                                                                             geneID
## GO:0015979 sll1214/slr0737/sll1194/sll1184/sll0928/sml0005/slr2051/ssl3093/sll1580/sll1579/sll1578/sll1577/slr1834/slr1835/sll1281/sml0007/slr1986/sml0008/sll0819/sll1091/ssr2831/sll0258/slr0335/smr0001/slr0342/slr1459/smr0004
## GO:0018298                                                                                                 slr1311/sll0928/slr0687/sll1578/sll1577/slr1834/slr1835/sll0851/slr1986/slr1963/sll1867/slr0335/slr0906/sll0041/slr1459
## GO:0042651                 slr1311/sll1326/ssl2615/sll1322/sml0005/sll1814/slr1834/slr1835/sll1281/sml0007/sll0851/slr0261/slr1513/slr1330/sml0008/sll1867/ssr2831/sll0258/ssl0563/smr0001/slr0342/sll0199/slr0906/slr0083/smr0004
## GO:0030089                                                                                                                         sll0928/slr2051/ssl3093/sll1580/sll1579/sll1578/sll1577/slr1986/slr1963/slr1687/slr0335/slr1459
## GO:0030096                                                                                                                                         slr1311/sll1194/sml0005/sll1281/sml0007/sll0851/sll1867/sll0258/smr0001/slr0906
## GO:0016168                                                                                                                                                                         slr1311/slr1834/slr1835/sll0851/sll1867/slr0906
##            Count
## GO:0015979    27
## GO:0018298    15
## GO:0042651    25
## GO:0030089    12
## GO:0030096    10
## GO:0016168     6
# compared to universe: Where are PSS generally localized / what was detectable
kegg_PSS_down_enrich_2 <- kegg_functional_enrichment_locusList(locus_tags_down, locus_tags_all_PSS, write=TRUE, path="output/enrichment/KEGG_PSSdown_universeDetectedCDS.csv")
##                ID                       Description GeneRatio BgRatio
## syn00195 syn00195                    Photosynthesis     24/90  33/217
## syn00196 syn00196 Photosynthesis - antenna proteins     10/90  12/217
##                pvalue    p.adjust       qvalue
## syn00195 8.589097e-05 0.001030692 0.0009041155
## syn00196 3.073733e-03 0.018442395 0.0161775399
go_PSS_down_enrich_2 <- go_functional_enrichment_locusList(locus_tags_down, locus_tags_all_PSS, write=TRUE, path="output/enrichment/GO_PSSdown_universeDetectedCDS.csv")
##                    ID                                      Description
## GO:0015979 GO:0015979                                   photosynthesis
## GO:0018298 GO:0018298                      protein-chromophore linkage
## GO:0030089 GO:0030089                                    phycobilisome
## GO:0042651 GO:0042651                               thylakoid membrane
## GO:0003676 GO:0003676                             nucleic acid binding
## GO:0030096 GO:0030096 plasma membrane-derived thylakoid photosystem II
##            GeneRatio BgRatio       pvalue    p.adjust       qvalue
## GO:0015979    27/153  39/438 5.102753e-06 0.000132952 0.0001186528
## GO:0018298    15/153  17/438 5.780523e-06 0.000132952 0.0001186528
## GO:0030089    12/153  14/438 1.083580e-04 0.001661490 0.0014827943
## GO:0042651    25/153  42/438 5.460848e-04 0.006279976 0.0056045550
## GO:0003676     8/153  10/438 4.337163e-03 0.039901896 0.0356103877
## GO:0030096    10/153  14/438 5.250170e-03 0.040251306 0.0359222180
##                                                                                                                                                                                                                             geneID
## GO:0015979 sll1214/slr0737/sll1194/sll1184/sll0928/sml0005/slr2051/ssl3093/sll1580/sll1579/sll1578/sll1577/slr1834/slr1835/sll1281/sml0007/slr1986/sml0008/sll0819/sll1091/ssr2831/sll0258/slr0335/smr0001/slr0342/slr1459/smr0004
## GO:0018298                                                                                                 slr1311/sll0928/slr0687/sll1578/sll1577/slr1834/slr1835/sll0851/slr1986/slr1963/sll1867/slr0335/slr0906/sll0041/slr1459
## GO:0030089                                                                                                                         sll0928/slr2051/ssl3093/sll1580/sll1579/sll1578/sll1577/slr1986/slr1963/slr1687/slr0335/slr1459
## GO:0042651                 slr1311/sll1326/ssl2615/sll1322/sml0005/sll1814/slr1834/slr1835/sll1281/sml0007/sll0851/slr0261/slr1513/slr1330/sml0008/sll1867/ssr2831/sll0258/ssl0563/smr0001/slr0342/sll0199/slr0906/slr0083/smr0004
## GO:0003676                                                                                                                                                         slr1130/slr0692/sll1910/ssr1480/sll0362/slr0915/slr0083/sll0517
## GO:0030096                                                                                                                                         slr1311/sll1194/sml0005/sll1281/sml0007/sll0851/sll1867/sll0258/smr0001/slr0906
##            Count
## GO:0015979    27
## GO:0018298    15
## GO:0030089    12
## GO:0042651    25
## GO:0003676     8
## GO:0030096    10
p <- dotplot(kegg_PSS_down_enrich_2)
## wrong orderBy parameter; set to default `orderBy = "x"`
p 

ggsave("output/enrichment/KEGG_dotplot_PSS_down.pdf", plot=p, width=20, height=14, units="cm")
p <- dotplot(go_PSS_down_enrich_2)
## wrong orderBy parameter; set to default `orderBy = "x"`
p

ggsave("output/enrichment/GO_dotplot_PSSdown.pdf", plot=p, width=20, height=14, units="cm")

4.7 Distance of PSS to Start / Stop position

count_overlaps_xNt_upDownstream <- function(Ranges_object, Ranges_object_2, sequence_object, start_stop="start", len=25){
  # function returns df with total number overlapping between Ranges_object and Ranges_object_2, iterating from start or stop codon (choose by setting start_stop to "start" or "stop") of Ranges_object +/- nucleotide positions. 
  
  # prepare df to return
  df_overlaps <- data.frame(rbind(rep("NA", length((-len):(+len)))))
  names(df_overlaps) <- (-len):(+len)
    
  # extract lengths of sequences (needed for catching cases in which start_end -len or +len exceeds chromosome/plasmid length) -> if this is the case: since circular DNA: start from other end with counting again (see below)
  lengths <- c()
  for(i in 1:length(sequence_object)){
    lengths[i] <- width(sequence_object[i])
  }
  names(lengths) <- names(sequence_object)
  
  # extract info about what to extract (depending on strength: upstream corresponds to either + or -)
  strands <- data.frame(Ranges_object)[,5]
  plus_strand <- which(strand(Ranges_object)=="+")
  minus_strand <- which(strand(Ranges_object)=="-")
  seqs <- as.character(data.frame(Ranges_object)[,1])
  
  for(d in (-len):(+len)){
    positions <- c()
  
    # get positions of start_end on plus and minus strand
    if(start_stop=="start"){
      positions[plus_strand] <- start(Ranges_object[plus_strand])-(-d) # e.g. - (- -25) -> corresponds to -25
      positions[minus_strand] <- end(Ranges_object[minus_strand])+(-d) # e.g. + (- -25) -> corresponds to +25
    }else{
      positions[plus_strand] <- end(Ranges_object[plus_strand])-(-d) # e.g. - (- -25) -> corresponds to -25
      positions[minus_strand] <- start(Ranges_object[minus_strand])+(-d) # e.g. + (- -25) -> corresponds to +25
    }
    # positions < 0
    indices <- which(positions < 0)
    positions[indices] <- lengths[seqs[indices]]-(-positions[indices])
    
    # positions at which positions > length of respective plasmid / chromosome
    indices <- which(positions>lengths[seqs])
    positions[indices] <- positions[indices]-lengths[seqs[indices]]
    
    # create GRanges object
    temp_GRanges <- GRanges(seqnames=seqs, ranges=IRanges(start = positions, end=positions), strand = strands)
    
    # count overlaps    
    overlap_vect <- countOverlaps(temp_GRanges, Ranges_object_2)
    df_overlaps[,as.character(d)] <- sum(overlap_vect)
  }
  
  return(df_overlaps)
}

4.7.1 For PSS which are upregulated in rne(WT)

overlaps_PSS_up_start_CDS <- count_overlaps_xNt_upDownstream(features[which(features$type=="CDS")], PSS_Ranges_up, syne_fasta, len = 300, start_stop = "start")
overlaps_PSS_up_stop_CDS <- count_overlaps_xNt_upDownstream(features[which(features$type=="CDS")], PSS_Ranges_up, syne_fasta, len = 300, start_stop = "stop")

overlaps_PSS_up_start_CDS_ggplot <- data.frame(t(overlaps_PSS_up_start_CDS))
overlaps_PSS_up_start_CDS_ggplot$nt_pos <- as.numeric(row.names(overlaps_PSS_up_start_CDS_ggplot))
names(overlaps_PSS_up_start_CDS_ggplot)[1] <- "count"

overlaps_PSS_up_stop_CDS_ggplot <- data.frame(t(overlaps_PSS_up_stop_CDS))
overlaps_PSS_up_stop_CDS_ggplot$nt_pos <- as.numeric(row.names(overlaps_PSS_up_stop_CDS_ggplot))
names(overlaps_PSS_up_stop_CDS_ggplot)[1] <- "count"
p <- ggplot(overlaps_PSS_up_start_CDS_ggplot, aes(x=nt_pos, y=count)) + geom_line(size=0.5, color="#005a96ff") + theme_light() + labs(x="Distance relative to start codon (nt)", y="Number of cleavage sites") + ylim(0,8)+ geom_vline(aes(xintercept=0), linetype=2, color="darkgray")
p

ggsave("output/distance_startStop/PSS_Up_count_relativeStartCodon.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(overlaps_PSS_up_stop_CDS_ggplot, aes(x=nt_pos, y=count)) + geom_line(size=0.5, color="#005a96ff") + theme_light() + labs(x="Distance relative to stop codon (nt)", y="Number of cleavage sites") + ylim(0,8) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray")
p

ggsave("output/distance_startStop/PSS_Up_count_relativeStopCodon.pdf", plot=p, width=10, height=7, units="cm")
overlaps_PSS_up_start_CDS[which(overlaps_PSS_up_start_CDS==max(overlaps_PSS_up_start_CDS))]
##   -1
## 1  6
overlaps_PSS_up_stop_CDS[which(overlaps_PSS_up_stop_CDS==max(overlaps_PSS_up_stop_CDS))]
##   2
## 1 6

4.7.1.1 Identify positions at which PSS are localised exactly upstream of start codons

# extract lengths of sequences (needed for catching cases in which start_end -len or +len exceeds chromosome/plasmid length) -> if this is the case: since circular DNA: start from other end with counting again (see below)
lengths_syne <- c()
for(i in 1:length(syne_fasta)){
  lengths_syne[i] <- width(syne_fasta[i])
}
names(lengths_syne) <- names(syne_fasta)
  
# extract info about what to extract (depending on strength: upstream corresponds to either + or -)
strands <- data.frame(CDS_features)[,5]
plus_strand <- which(strand(CDS_features)=="+")
minus_strand <- which(strand(CDS_features)=="-")
seqs <- as.character(data.frame(CDS_features)[,1])

positions <- c()
  
## for start codons
# get positions of start_end on plus and minus strand
positions[plus_strand] <- start(CDS_features[plus_strand])-1
positions[minus_strand] <- end(CDS_features[minus_strand])+1
    
# positions < 0
indices <- which(positions < 0)
positions[indices] <- lengths_syne[seqs[indices]]-(-positions[indices])
    
# positions at which positions > length of respective plasmid / chromosome
indices <- which(positions>lengths_syne[seqs])
positions[indices] <- positions[indices]-lengths_syne[seqs[indices]]
    
# create GRanges object
temp_GRanges <- GRanges(seqnames=seqs, ranges=IRanges(start = positions, end=positions), strand = strands)
    
print(temp_GRanges[which(countOverlaps(temp_GRanges, PSS_Ranges_up)>0)])
## GRanges object with 6 ranges and 0 metadata columns:
##         seqnames    ranges strand
##            <Rle> <IRanges>  <Rle>
##   [1] BA000022.2      7228      +
##   [2] BA000022.2    727467      -
##   [3] BA000022.2    983803      +
##   [4] BA000022.2   2375091      +
##   [5] BA000022.2   2610072      +
##   [6] BA000022.2   3370058      +
##   -------
##   seqinfo: 5 sequences from an unspecified genome; no seqlengths
## for stop codons
# get positions of start_end on plus and minus strand
positions[plus_strand] <- end(CDS_features[plus_strand])+2
positions[minus_strand] <- start(CDS_features[minus_strand])-2
    
# positions < 0
indices <- which(positions < 0)
positions[indices] <- lengths_syne[seqs[indices]]-(-positions[indices])
    
# positions at which positions > length of respective plasmid / chromosome
indices <- which(positions>lengths_syne[seqs])
positions[indices] <- positions[indices]-lengths_syne[seqs[indices]]
    
# create GRanges object
temp_GRanges <- GRanges(seqnames=seqs, ranges=IRanges(start = positions, end=positions), strand = strands)
    
print(temp_GRanges[which(countOverlaps(temp_GRanges, PSS_Ranges_up)>0)])
## GRanges object with 6 ranges and 0 metadata columns:
##         seqnames    ranges strand
##            <Rle> <IRanges>  <Rle>
##   [1] AP006585.1     27798      +
##   [2] AP004310.1    103153      +
##   [3] BA000022.2   1687324      -
##   [4] BA000022.2   2086303      -
##   [5] BA000022.2   2645099      +
##   [6] BA000022.2   2825357      +
##   -------
##   seqinfo: 5 sequences from an unspecified genome; no seqlengths
# remove all variables which have same name within count_overlaps_xNt_upDownstream()
rm(positions)
rm(plus_strand)
rm(minus_strand)
rm(indices)
rm(strands)
rm(seqs)
rm(temp_GRanges)

4.7.2 For PSS which are upregulated in rne(Ts)

overlaps_PSS_down_start_CDS <- count_overlaps_xNt_upDownstream(features[which(features$type=="CDS")], PSS_Ranges_down, syne_fasta, len = 300, start_stop = "start")
overlaps_PSS_down_stop_CDS <- count_overlaps_xNt_upDownstream(features[which(features$type=="CDS")], PSS_Ranges_down, syne_fasta, len = 300, start_stop = "stop")

overlaps_PSS_down_start_CDS_ggplot <- data.frame(t(overlaps_PSS_down_start_CDS))
overlaps_PSS_down_start_CDS_ggplot$nt_pos <- as.numeric(row.names(overlaps_PSS_down_start_CDS_ggplot))
names(overlaps_PSS_down_start_CDS_ggplot)[1] <- "count"

overlaps_PSS_down_stop_CDS_ggplot <- data.frame(t(overlaps_PSS_down_stop_CDS))
overlaps_PSS_down_stop_CDS_ggplot$nt_pos <- as.numeric(row.names(overlaps_PSS_down_stop_CDS_ggplot))
names(overlaps_PSS_down_stop_CDS_ggplot)[1] <- "count"
p <- ggplot(overlaps_PSS_down_start_CDS_ggplot, aes(x=nt_pos, y=count)) + geom_line(size=0.5, color="#e69f00ff") + theme_light() + labs(x="Distance relative to start codon (nt)", y="Number of cleavage sites") + ylim(0,8)+ geom_vline(aes(xintercept=0), linetype=2, color="darkgray")
p

ggsave("output/distance_startStop/PSS_Down_count_relativeStartCodon.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(overlaps_PSS_down_stop_CDS_ggplot, aes(x=nt_pos, y=count)) + geom_line(size=0.5, color="#e69f00ff") + theme_light() + labs(x="Distance relative to stop codon (nt)", y="Number of cleavage sites") + ylim(0,8)+ geom_vline(aes(xintercept=0), linetype=2, color="darkgray")
p

ggsave("output/distance_startStop/PSS_Down_count_relativeStopCodon.pdf", plot=p, width=10, height=7, units="cm")
overlaps_PSS_down_start_CDS[which(overlaps_PSS_down_start_CDS==max(overlaps_PSS_down_start_CDS))]
##   -186 24 57 60 144 198 231
## 1    4  4  4  4   4   4   4
overlaps_PSS_down_stop_CDS[which(overlaps_PSS_down_stop_CDS==max(overlaps_PSS_down_stop_CDS))]
##   -179
## 1    8

4.8 Perform RNA Fold with sliding window approach

To investigate secondary structures around processing sites: Extract 150nt upstream / downstream of processing sites and calculate minimum free energy (\(\Delta\)G). Do this also for random peaks, start and end positions of TUs, start and end positions of CDS. Use sliding window approach to reduce problem of transcript boundaries.

4.8.1 Processing Sites

# export sequences from peaks + / - 150nt
peaks_all_150 <- extract_RNA_sequences(PSS_Ranges, syne_fasta, 150)
peaks_up_150 <- extract_RNA_sequences(PSS_Ranges_up, syne_fasta, 150)
peaks_down_150 <- extract_RNA_sequences(PSS_Ranges_down, syne_fasta, 150)

writeXStringSet(peaks_all_150, "output/RNAfold/all_peaks_150.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(peaks_up_150, "output/RNAfold/peaks_up_150.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(peaks_down_150, "output/RNAfold/peaks_down_150.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")

run from command line (!takes a long time!): python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/peaks_up_150.fasta RNAfold/mfe_files/mfe_peaks_up_150.tsv 50 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/peaks_down_150.fasta RNAfold/mfe_files/mfe_peaks_down_150.tsv 50 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/all_peaks_150.fasta RNAfold/mfe_files/mfe_all_peaks_150_150.tsv 50

mfe_PSS_up <- read.delim("output/RNAfold/mfe_files/mfe_peaks_up_150.tsv", header=TRUE, row.names=1)
mfe_PSS_up_control <- read.delim("output/RNAfold/mfe_files/mfe_peaks_up_150_control.tsv", header=TRUE, row.names=1)
mfe_PSS_down <- read.delim("output/RNAfold/mfe_files/mfe_peaks_down_150.tsv", header=TRUE, row.names=1)
mfe_PSS_down_control <- read.delim("output/RNAfold/mfe_files/mfe_peaks_down_150_control.tsv", header=TRUE, row.names=1)

Original sequences represented nucleotide positions before (-150:-1) and after (+1:+150) cleavage position. With the floating window approach etc., this translates to position “150.5” (index 126).

df_mfe_PSS_quartiles <- data.frame(cbind(ntpos=c(rep(c(-125:125),2)), mfe_distrib=c(apply(mfe_PSS_up, 1, median), apply(mfe_PSS_down, 1, median),apply(mfe_PSS_up, 1, quantile,probs=0.25), apply(mfe_PSS_down, 1, quantile,probs=0.25),apply(mfe_PSS_up, 1,  quantile,probs=0.75), apply(mfe_PSS_down, 1,  quantile,probs=0.75))))
df_mfe_PSS_quartiles$updown <- rep(c(rep("up",251), rep("down", 251)),3)
df_mfe_PSS_quartiles$median_quartile <- c(rep("median", 502), rep("1quartile", 502), rep("3quartile",502))

mean_or_median = mean
df_mfe_PSS <- data.frame(cbind(ntpos=c(rep(c(-125:125),4)), mfe=c(apply(mfe_PSS_up, 1, mean_or_median), apply(mfe_PSS_down, 1, mean_or_median), apply(mfe_PSS_up_control, 1, mean_or_median), apply(mfe_PSS_down_control,1, mean_or_median))))
df_mfe_PSS$updown <- rep(c(rep("up",251), rep("down", 251)),2)
df_mfe_PSS$control <- c(rep("data", 502), rep("control", 502))

Plot with shuffled data and non-shuffled data

p <- ggplot(data=df_mfe_PSS, aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(aes(linetype=control), size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(Ts)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-10,-4.75)
p

ggsave("output/RNAfold/figures/deltaG_PSS_shuffled.pdf", plot=p, width=10, height=7, units="cm")

Plot with quartiles (1st, median, 3rd)

p <- ggplot(data=df_mfe_PSS_quartiles, aes(x=ntpos, y=mfe_distrib, color=updown, linetype=median_quartile)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + 
  scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(Ts)>WT")) + 
  scale_linetype_manual(name="Quartiles",values=c(3,1,3), breaks=c("1quartile", "median", "3quartile"), labels=c("1st", "2nd", "3rd")) +
  xlab("Distance Relative to Cleavage Site (nt)") + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-14,-2)
p

ggsave("output/RNAfold/figures/deltaG_PSS_quartiles.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(data=subset(df_mfe_PSS, df_mfe_PSS$control=="data"), aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(Ts)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-10,-4.75)
p

ggsave("output/RNAfold/figures/deltaG_PSS.pdf", plot=p, width=10, height=7, units="cm")

run from command line (!takes a long time!): python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/peaks_up_150.fasta RNAfold/mfe_files/mfe_peaks_up_150_25nt.tsv 25 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/peaks_down_150.fasta RNAfold/mfe_files/mfe_peaks_down_150_25nt.tsv 25 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/all_peaks_150.fasta RNAfold/mfe_files/mfe_all_peaks_150_25nt.tsv 25

mfe_PSS_25nt_up <- read.delim("output/RNAfold/mfe_files/mfe_peaks_up_150_25nt.tsv", header=TRUE, row.names=1)
mfe_PSS_25nt_down <- read.delim("output/RNAfold/mfe_files/mfe_peaks_down_150_25nt.tsv", header=TRUE, row.names=1)
mfe_PSS_25nt_up_control <- read.delim("output/RNAfold/mfe_files/mfe_peaks_up_150_25nt_control.tsv", header=TRUE, row.names=1)
mfe_PSS_25nt_down_control <- read.delim("output/RNAfold/mfe_files/mfe_peaks_down_150_25nt_control.tsv", header=TRUE, row.names=1)

Processing site at position 150.5. When x-axis is labeled from -137.5:137.5, this corresponds to -0

df_mfe_PSS_25nt_quartiles <- data.frame(cbind(ntpos=c(rep(c(-137.5:137.5),2)), mfe_distrib=c(apply(mfe_PSS_25nt_up, 1, median), apply(mfe_PSS_25nt_down, 1, median),apply(mfe_PSS_25nt_up, 1, quantile,probs=0.25), apply(mfe_PSS_25nt_down, 1, quantile,probs=0.25),apply(mfe_PSS_25nt_up, 1,  quantile,probs=0.75), apply(mfe_PSS_25nt_down, 1,  quantile,probs=0.75))))
df_mfe_PSS_25nt_quartiles$updown <- rep(c(rep("up",276), rep("down", 276)),3)
df_mfe_PSS_25nt_quartiles$median_quartile <- c(rep("median", 552), rep("1quartile", 552), rep("3quartile",552))

mean_or_median = mean
df_mfe_PSS_25nt <- data.frame(cbind(ntpos=c(rep(c(-137.5:137.5),4)), mfe=c(apply(mfe_PSS_25nt_up, 1, mean_or_median), apply(mfe_PSS_25nt_down, 1, mean_or_median), apply(mfe_PSS_25nt_up_control, 1, mean_or_median), apply(mfe_PSS_25nt_down_control,1, mean_or_median))))
df_mfe_PSS_25nt$updown <- rep(c(rep("up",276), rep("down", 276)),2)
df_mfe_PSS_25nt$control <- c(rep("data", 552), rep("control", 552))
p <- ggplot(data=df_mfe_PSS_25nt, aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(aes(linetype=control), size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)", "rne(Ts)")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(-3.4,-1)
p

ggsave("output/RNAfold/figures/deltaG_PSS_25nt_shuffled.pdf", plot=p, width=10, height=7, units="cm")

p <- ggplot(data=df_mfe_PSS_25nt_quartiles, aes(x=ntpos, y=mfe_distrib, color=updown, linetype=median_quartile)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + 
  scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)", "rne(Ts)")) + 
  scale_linetype_manual(name="Quartiles",values=c(3,1,3), breaks=c("1quartile", "median", "3quartile"), labels=c("1st", "2nd", "3rd")) +
  xlab("Distance Relative to Cleavage Site (nt)") + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-5.1,0.1)
p

ggsave("output/RNAfold/figures/deltaG_PSS_25nt_quartiles.pdf", plot=p, width=10, height=7, units="cm")

p <- ggplot(data=subset(df_mfe_PSS_25nt, df_mfe_PSS_25nt$control=="data"), aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)", "rne(Ts)")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(-3.4,-1)
p

ggsave("output/RNAfold/figures/deltaG_PSS_25nt.pdf", plot=p, width=10, height=7, units="cm")

4.8.2 Random Sites

# use same amount of random positions
shuffle_Ranges_perPlasmid <- function(Ranges_object, sequence_object, seed=NULL){
  set.seed(seed)

  # extract lengths of sequences
  lengths <- c()
  for(i in 1:length(sequence_object)){
    lengths[i] <- width(sequence_object[i])
  }
  names(lengths) <- names(sequence_object)

  # initialise new_ranges, end has to be set according to length of plasmids / chromosome
  new_ranges <- Ranges_object
  start(new_ranges) <- rep(1, length(new_ranges))
  
  # do actual shuffling  
  for(seq_i in 1:length(lengths)){
    seq <- names(lengths)[seq_i]
    indices <- which(seqnames(new_ranges)==seq)
    # set ends to max possible length to avoid width < 0
    end(new_ranges[indices]) <- rep(lengths[seq], length(new_ranges[indices]))
    
    # create random numbers
    start(new_ranges[indices]) <- round(runif(length(new_ranges[indices]), 1, lengths[seq]))
    end(new_ranges[indices]) <- start(new_ranges[indices])
  }
  
  return(new_ranges)
}

random_all_Ranges <- shuffle_Ranges_perPlasmid(PSS_Ranges, syne_fasta, 42)
random_up_Ranges <- shuffle_Ranges_perPlasmid(PSS_Ranges_up, syne_fasta, 42)
random_down_Ranges <- shuffle_Ranges_perPlasmid(PSS_Ranges_down, syne_fasta, 42)

random_all_150 <- extract_RNA_sequences(random_all_Ranges, syne_fasta, 150)
random_up_150 <- extract_RNA_sequences(random_up_Ranges, syne_fasta, 150)
random_down_150 <- extract_RNA_sequences(random_down_Ranges, syne_fasta, 150)

writeXStringSet(random_all_150, "output/RNAfold/random_all_peaks_150.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(random_up_150, "output/RNAfold/random_up_150.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(random_down_150, "output/RNAfold/random_down_150.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")

run from command line (!takes a long time!): python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/random_up_150.fasta RNAfold/mfe_files/mfe_random_up_150.tsv 50 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/random_down_150.fasta RNAfold/mfe_files/mfe_random_down_150.tsv 50 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/random_all_peaks_150.fasta RNAfold/mfe_files/mfe_random_all_peaks_150.tsv 50

mfe_random_up <- read.delim("output/RNAfold/mfe_files/mfe_random_up_150.tsv", header=TRUE, row.names=1)
mfe_random_up_control <- read.delim("output/RNAfold/mfe_files/mfe_random_up_150_control.tsv", header=TRUE, row.names=1)
mfe_random_down <- read.delim("output/RNAfold/mfe_files/mfe_random_down_150.tsv", header=TRUE, row.names=1)
mfe_random_down_control <- read.delim("output/RNAfold/mfe_files/mfe_random_down_150_control.tsv", header=TRUE, row.names=1)
df_mfe_random_quartiles <- data.frame(cbind(ntpos=c(rep(c(-125:125),2)), mfe_distrib=c(apply(mfe_random_up, 1, median), apply(mfe_random_down, 1, median),apply(mfe_random_up, 1, quantile,probs=0.25), apply(mfe_random_down, 1, quantile,probs=0.25),apply(mfe_random_up, 1,  quantile,probs=0.75), apply(mfe_random_down, 1,  quantile,probs=0.75))))
df_mfe_random_quartiles$updown <- rep(c(rep("up",251), rep("down", 251)),3)
df_mfe_random_quartiles$median_quartile <- c(rep("median", 502), rep("1quartile", 502), rep("3quartile",502))

mean_or_median = mean
df_mfe_random <- data.frame(cbind(ntpos=c(rep(c(-125:125),4)), mfe=c(apply(mfe_random_up, 1, mean_or_median), apply(mfe_random_down, 1, mean_or_median), apply(mfe_random_up_control, 1, mean_or_median), apply(mfe_random_down_control,1, mean_or_median))))
df_mfe_random$updown <- rep(c(rep("up",251), rep("down", 251)),2)
df_mfe_random$control <- c(rep("data", 502), rep("control", 502))

Plot with shuffled data and non-shuffled data

p <- ggplot(data=df_mfe_random, aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(aes(linetype=control), size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(Ts)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-10,-4.75)
p

ggsave("output/RNAfold/figures/deltaG_random_shuffled.pdf", plot=p, width=10, height=7, units="cm")

Plot with quartiles (1st, median, 3rd)

p <- ggplot(data=df_mfe_random_quartiles, aes(x=ntpos, y=mfe_distrib, color=updown, linetype=median_quartile)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + 
  scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(Ts)>WT")) + 
  scale_linetype_manual(name="Quartiles",values=c(3,1,3), breaks=c("1quartile", "median", "3quartile"), labels=c("1st", "2nd", "3rd")) +
  xlab("Distance Relative to Cleavage Site (nt)") + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-14,-2)
p

ggsave("output/RNAfold/figures/deltaG_random_quartiles.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(data=subset(df_mfe_random, df_mfe_PSS$control=="data"), aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(Ts)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-10,-4.75)
p

ggsave("output/RNAfold/figures/deltaG_random.pdf", plot=p, width=10, height=7, units="cm")

run from command line (!takes a long time!): python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/random_up_150.fasta RNAfold/mfe_files/mfe_random_up_150_25nt.tsv 25 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/random_down_150.fasta RNAfold/mfe_files/mfe_random_down_150_25nt.tsv 25 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/random_all_peaks_150.fasta RNAfold/mfe_files/mfe_all_random_150_25nt.tsv 25

mfe_random_25nt_up <- read.delim("output/RNAfold/mfe_files/mfe_random_up_150_25nt.tsv", header=TRUE, row.names=1)
mfe_random_25nt_down <- read.delim("output/RNAfold/mfe_files/mfe_random_down_150_25nt.tsv", header=TRUE, row.names=1)
mfe_random_25nt_up_control <- read.delim("output/RNAfold/mfe_files/mfe_random_up_150_25nt_control.tsv", header=TRUE, row.names=1)
mfe_random_25nt_down_control <- read.delim("output/RNAfold/mfe_files/mfe_random_down_150_25nt_control.tsv", header=TRUE, row.names=1)

Processing site at position 150.5. When x-axis is labeled from -137.5:137.5, this corresponds to -0

df_mfe_random_25nt_quartiles <- data.frame(cbind(ntpos=c(rep(c(-137.5:137.5),2)), mfe_distrib=c(apply(mfe_random_25nt_up, 1, median), apply(mfe_random_25nt_down, 1, median),apply(mfe_random_25nt_up, 1, quantile,probs=0.25), apply(mfe_random_25nt_down, 1, quantile,probs=0.25),apply(mfe_random_25nt_up, 1,  quantile,probs=0.75), apply(mfe_random_25nt_down, 1,  quantile,probs=0.75))))
df_mfe_random_25nt_quartiles$updown <- rep(c(rep("up",276), rep("down", 276)),3)
df_mfe_random_25nt_quartiles$median_quartile <- c(rep("median", 552), rep("1quartile", 552), rep("3quartile",552))

mean_or_median = mean
df_mfe_random_25nt <- data.frame(cbind(ntpos=c(rep(c(-137.5:137.5),4)), mfe=c(apply(mfe_random_25nt_up, 1, mean_or_median), apply(mfe_random_25nt_down, 1, mean_or_median), apply(mfe_random_25nt_up_control, 1, mean_or_median), apply(mfe_random_25nt_down_control,1, mean_or_median))))
df_mfe_random_25nt$updown <- rep(c(rep("up",276), rep("down", 276)),2)
df_mfe_random_25nt$control <- c(rep("data", 552), rep("control", 552))
p <- ggplot(data=df_mfe_random_25nt, aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(aes(linetype=control), size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(Ts)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(-3.4,-1)
p

ggsave("output/RNAfold/figures/deltaG_random_25nt_shuffled.pdf", plot=p, width=10, height=7, units="cm")

p <- ggplot(data=df_mfe_random_25nt_quartiles, aes(x=ntpos, y=mfe_distrib, color=updown, linetype=median_quartile)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + 
  scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(Ts)>WT")) + 
  scale_linetype_manual(name="Quartiles",values=c(3,1,3), breaks=c("1quartile", "median", "3quartile"), labels=c("1st", "2nd", "3rd")) +
  xlab("Distance Relative to Cleavage Site (nt)") + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-5.1,0.1)
p

ggsave("output/RNAfold/figures/deltaG_random_25nt_quartiles.pdf", plot=p, width=10, height=7, units="cm")

p <- ggplot(data=subset(df_mfe_random_25nt, df_mfe_random_25nt$control=="data"), aes(x=ntpos, y=mfe, color=updown)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks", breaks=c("up", "down"), labels=c("rne(WT)>Ts", "rne(Ts)>WT")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(-3.4,-1)
p

ggsave("output/RNAfold/figures/deltaG_random_25nt.pdf", plot=p, width=10, height=7, units="cm")

4.8.3 TUs

# on plus strand: beginning of feature is start(), on minus strand: beginning of feature is end()
plus_indices <- which(strand(TUs)=="+")
minus_indices <- which(strand(TUs)=="-")

TU_starts <- TUs
end(TU_starts[plus_indices]) <- start(TU_starts[plus_indices])
start(TU_starts[minus_indices]) <- end(TU_starts[minus_indices])

TU_ends <- TUs
start(TU_ends[plus_indices]) <- end(TU_ends[plus_indices])
end(TU_ends[minus_indices]) <- start(TU_ends[minus_indices])

TU_starts_sequences <- extract_RNA_sequences(TU_starts, syne_fasta, 150)
TU_ends_sequences <- extract_RNA_sequences(TU_ends, syne_fasta, 150)

writeXStringSet(TU_starts_sequences, "output/RNAfold/TU_starts_sequences_150.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(TU_ends_sequences, "output/RNAfold/TU_ends_sequences_150.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")

run from command line (!takes a long time!): python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/TU_starts_sequences_150.fasta RNAfold/mfe_files/mfe_TU_starts_sequences_150.tsv 50 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/TU_ends_sequences_150.fasta RNAfold/mfe_files/mfe_TU_ends_sequences_150.tsv 50

mfe_TUs_start <- read.delim("output/RNAfold/mfe_files/mfe_TU_starts_sequences_150.tsv", header=TRUE, row.names=1)
mfe_TUs_start_control <- read.delim("output/RNAfold/mfe_files/mfe_TU_starts_sequences_150_control.tsv", header=TRUE, row.names=1)
mfe_TUs_stop <- read.delim("output/RNAfold/mfe_files/mfe_TU_ends_sequences_150.tsv", header=TRUE, row.names=1)
mfe_TUs_stop_control <- read.delim("output/RNAfold/mfe_files/mfe_TU_ends_sequences_150_control.tsv", header=TRUE, row.names=1)
df_mfe_TUs_quartiles <- data.frame(cbind(ntpos=c(rep(c(-125:125),2)), mfe_distrib=c(apply(mfe_TUs_start, 1, median), apply(mfe_TUs_stop, 1, median),apply(mfe_TUs_start, 1, quantile,probs=0.25), apply(mfe_TUs_stop, 1, quantile,probs=0.25),apply(mfe_TUs_start, 1,  quantile,probs=0.75), apply(mfe_TUs_stop, 1,  quantile,probs=0.75))))
df_mfe_TUs_quartiles$startstop <- rep(c(rep("start",251), rep("stop", 251)),3)
df_mfe_TUs_quartiles$median_quartile <- c(rep("median", 502), rep("1quartile", 502), rep("3quartile",502))

mean_or_median = mean
df_mfe_TUs <- data.frame(cbind(ntpos=c(rep(c(-125:125),4)), mfe=c(apply(mfe_TUs_start, 1, mean_or_median), apply(mfe_TUs_stop, 1, mean_or_median), apply(mfe_TUs_start_control, 1, mean_or_median), apply(mfe_TUs_stop_control,1, mean_or_median))))
df_mfe_TUs$startstop <- rep(c(rep("start",251), rep("stop", 251)),2)
df_mfe_TUs$control <- c(rep("data", 502), rep("control", 502))

Plot with shuffled data and non-shuffled data

p <- ggplot(data=df_mfe_TUs, aes(x=ntpos, y=mfe, color=startstop)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(aes(linetype=control), size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("black", "darkgray"), name  ="Position", breaks=c("start", "stop"), labels=c("Start", "End")) + xlab("Distance Relative to Start / Stop position (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-10,-4.75)
p

ggsave("output/RNAfold/figures/deltaG_TUs_shuffled.pdf", plot=p, width=10, height=7, units="cm")

Plot with quartiles (1st, median, 3rd)

p <- ggplot(data=df_mfe_TUs_quartiles, aes(x=ntpos, y=mfe_distrib, color=startstop, linetype=median_quartile)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + 
  scale_color_manual(values=c("black", "darkgray"), name  ="Position", breaks=c("start", "stop"), labels=c("Start", "End")) + 
  scale_linetype_manual(name="Quartiles",values=c(3,1,3), breaks=c("1quartile", "median", "3quartile"), labels=c("1st", "2nd", "3rd")) +
  xlab("Distance Relative to Cleavage Site (nt)") + xlab("Distance Relative to Start / Stop position (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-14,-2)
p

ggsave("output/RNAfold/figures/deltaG_TUs_quartiles.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(data=subset(df_mfe_TUs, df_mfe_TUs$control=="data"), aes(x=ntpos, y=mfe, color=startstop)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("black", "darkgray"), name  ="Position", breaks=c("start", "stop"), labels=c("Start", "End")) + xlab("Distance Relative to Start / Stop position (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-10,-4.75)
p

ggsave("output/RNAfold/figures/deltaG_TUs.pdf", plot=p, width=10, height=7, units="cm")

run from command line (!takes a long time!): python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/TU_starts_sequences_150.fasta RNAfold/mfe_files/mfe_TU_starts_sequences_150_25nt.tsv 25 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/TU_ends_sequences_150.fasta RNAfold/mfe_files/mfe_TU_ends_sequences_150_25nt.tsv 25

mfe_TUs_25nt_start <- read.delim("output/RNAfold/mfe_files/mfe_TU_starts_sequences_150_25nt.tsv", header=TRUE, row.names=1)
mfe_TUs_25nt_stop <- read.delim("output/RNAfold/mfe_files/mfe_TU_ends_sequences_150_25nt.tsv", header=TRUE, row.names=1)
mfe_TUs_25nt_start_control <- read.delim("output/RNAfold/mfe_files/mfe_TU_starts_sequences_150_25nt_control.tsv", header=TRUE, row.names=1)
mfe_TUs_25nt_stop_control <- read.delim("output/RNAfold/mfe_files/mfe_TU_ends_sequences_150_25nt_control.tsv", header=TRUE, row.names=1)
df_mfe_TUs_25nt_quartiles <- data.frame(cbind(ntpos=c(rep(c(-137.5:137.5),2)), mfe_distrib=c(apply(mfe_TUs_25nt_start, 1, median), apply(mfe_TUs_25nt_stop, 1, median),apply(mfe_TUs_25nt_start, 1, quantile,probs=0.25), apply(mfe_TUs_25nt_stop, 1, quantile,probs=0.25),apply(mfe_TUs_25nt_start, 1,  quantile,probs=0.75), apply(mfe_TUs_25nt_stop, 1,  quantile,probs=0.75))))
df_mfe_TUs_25nt_quartiles$startstop <- rep(c(rep("start",276), rep("stop", 276)),3)
df_mfe_TUs_25nt_quartiles$median_quartile <- c(rep("median", 552), rep("1quartile", 552), rep("3quartile",552))

mean_or_median = mean
df_mfe_TUs_25nt <- data.frame(cbind(ntpos=c(rep(c(-137.5:137.5),4)), mfe=c(apply(mfe_TUs_25nt_start, 1, mean_or_median), apply(mfe_TUs_25nt_stop, 1, mean_or_median), apply(mfe_TUs_25nt_start_control, 1, mean_or_median), apply(mfe_TUs_25nt_stop_control,1, mean_or_median))))
df_mfe_TUs_25nt$startstop <- rep(c(rep("start",276), rep("stop", 276)),2)
df_mfe_TUs_25nt$control <- c(rep("data", 552), rep("control", 552))
p <- ggplot(data=df_mfe_TUs_25nt, aes(x=ntpos, y=mfe, color=startstop)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(aes(linetype=control), size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("black", "darkgray"), name  ="Position", breaks=c("start", "stop"), labels=c("Start", "End")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(-3.4,-1)
p

ggsave("output/RNAfold/figures/deltaG_TUs_25nt_shuffled.pdf", plot=p, width=10, height=7, units="cm")

p <- ggplot(data=df_mfe_TUs_25nt_quartiles, aes(x=ntpos, y=mfe_distrib, color=startstop, linetype=median_quartile)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + 
  scale_color_manual(values=c("black", "darkgray"), name  ="Position", breaks=c("start", "stop"), labels=c("Start", "End")) + 
  scale_linetype_manual(name="Quartiles",values=c(3,1,3), breaks=c("1quartile", "median", "3quartile"), labels=c("1st", "2nd", "3rd")) +
  xlab("Distance Relative to Cleavage Site (nt)") + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-5.1,0.1)
p

ggsave("output/RNAfold/figures/deltaG_TUs_25nt_quartiles.pdf", plot=p, width=10, height=7, units="cm")

p <- ggplot(data=subset(df_mfe_TUs_25nt, df_mfe_random_25nt$control=="data"), aes(x=ntpos, y=mfe, color=startstop)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("black", "darkgray"), name  ="Position", breaks=c("start", "stop"), labels=c("Start", "End")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(-3.4,-1)
p

ggsave("output/RNAfold/figures/deltaG_TUs_25nt.pdf", plot=p, width=10, height=7, units="cm")

4.8.4 CDS

# on plus strand: beginning of feature is start(), on minus strand: beginning of feature is end()
plus_indices <- which(strand(CDS_features)=="+")
minus_indices <- which(strand(CDS_features)=="-")

CDS_starts <- CDS_features
end(CDS_starts[plus_indices]) <- start(CDS_starts[plus_indices])
start(CDS_starts[minus_indices]) <- end(CDS_starts[minus_indices])

CDS_ends <- CDS_features
start(CDS_ends[plus_indices]) <- end(CDS_ends[plus_indices])
end(CDS_ends[minus_indices]) <- start(CDS_ends[minus_indices])

CDS_starts_sequences <- extract_RNA_sequences(CDS_starts, syne_fasta, 150)
CDS_ends_sequences <- extract_RNA_sequences(CDS_ends, syne_fasta, 150)

writeXStringSet(CDS_starts_sequences, "output/RNAfold/CDS_starts_sequences_150.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")
writeXStringSet(CDS_ends_sequences, "output/RNAfold/CDS_ends_sequences_150.fasta", append=FALSE,
                compress=FALSE, compression_level=NA, format="fasta")

run from command line (!takes a long time!): python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/CDS_starts_sequences_150.fasta RNAfold/mfe_files/mfe_CDS_starts_sequences_150.tsv 50 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/CDS_ends_sequences_150.fasta RNAfold/mfe_files/mfe_CDS_ends_sequences_150.tsv 50

mfe_CDS_start <- read.delim("output/RNAfold/mfe_files/mfe_CDS_starts_sequences_150.tsv", header=TRUE, row.names=1)
mfe_CDS_start_control <- read.delim("output/RNAfold/mfe_files/mfe_CDS_starts_sequences_150_control.tsv", header=TRUE, row.names=1)
mfe_CDS_stop <- read.delim("output/RNAfold/mfe_files/mfe_CDS_ends_sequences_150.tsv", header=TRUE, row.names=1)
mfe_CDS_stop_control <- read.delim("output/RNAfold/mfe_files/mfe_CDS_ends_sequences_150_control.tsv", header=TRUE, row.names=1)
df_mfe_CDS_quartiles <- data.frame(cbind(ntpos=c(rep(c(-125:125),2)), mfe_distrib=c(apply(mfe_CDS_start, 1, median), apply(mfe_CDS_stop, 1, median),apply(mfe_CDS_start, 1, quantile,probs=0.25), apply(mfe_CDS_stop, 1, quantile,probs=0.25),apply(mfe_CDS_start, 1,  quantile,probs=0.75), apply(mfe_CDS_stop, 1,  quantile,probs=0.75))))
df_mfe_CDS_quartiles$startstop <- rep(c(rep("start",251), rep("stop", 251)),3)
df_mfe_CDS_quartiles$median_quartile <- c(rep("median", 502), rep("1quartile", 502), rep("3quartile",502))

mean_or_median = mean
df_mfe_CDS <- data.frame(cbind(ntpos=c(rep(c(-125:125),4)), mfe=c(apply(mfe_CDS_start, 1, mean_or_median), apply(mfe_CDS_stop, 1, mean_or_median), apply(mfe_CDS_start_control, 1, mean_or_median), apply(mfe_CDS_stop_control,1, mean_or_median))))
df_mfe_CDS$startstop <- rep(c(rep("start",251), rep("stop", 251)),2)
df_mfe_CDS$control <- c(rep("data", 502), rep("control", 502))

Plot with shuffled data and non-shuffled data

p <- ggplot(data=df_mfe_CDS, aes(x=ntpos, y=mfe, color=startstop)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(aes(linetype=control), size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("black", "darkgray"), name  ="Position", breaks=c("start", "stop"), labels=c("Start", "End")) + xlab("Distance Relative to Start / Stop position (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-10,-4.75)
p

ggsave("output/RNAfold/figures/deltaG_CDS_shuffled.pdf", plot=p, width=10, height=7, units="cm")

Plot with quartiles (1st, median, 3rd)

p <- ggplot(data=df_mfe_CDS_quartiles, aes(x=ntpos, y=mfe_distrib, color=startstop, linetype=median_quartile)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + 
  scale_color_manual(values=c("black", "darkgray"), name  ="Position", breaks=c("start", "stop"), labels=c("Start", "End")) + 
  scale_linetype_manual(name="Quartiles",values=c(3,1,3), breaks=c("1quartile", "median", "3quartile"), labels=c("1st", "2nd", "3rd")) +
  xlab("Distance Relative to Start / Stop position (nt)") + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-14,-2)
p

ggsave("output/RNAfold/figures/deltaG_CDS_quartiles.pdf", plot=p, width=10, height=7, units="cm")
p <- ggplot(data=subset(df_mfe_CDS, df_mfe_CDS$control=="data"), aes(x=ntpos, y=mfe, color=startstop)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("black", "darkgray"), name  ="Position", breaks=c("start", "stop"), labels=c("Start", "End")) + xlab("Distance Relative to Start / Stop position (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-10,-4.75)
p

ggsave("output/RNAfold/figures/deltaG_CDS.pdf", plot=p, width=10, height=7, units="cm")

run from command line (!takes a long time!): python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/CDS_starts_sequences_150.fasta RNAfold/mfe_files/mfe_CDS_starts_sequences_150_25nt.tsv 25 python RNAfold/RNAfold_inputFasta_outputMfeTable.py RNAfold/CDS_ends_sequences_150.fasta RNAfold/mfe_files/mfe_CDS_ends_sequences_150_25nt.tsv 25

mfe_CDS_25nt_start <- read.delim("output/RNAfold/mfe_files/mfe_CDS_starts_sequences_150_25nt.tsv", header=TRUE, row.names=1)
mfe_CDS_25nt_stop <- read.delim("output/RNAfold/mfe_files/mfe_CDS_ends_sequences_150_25nt.tsv", header=TRUE, row.names=1)
mfe_CDS_25nt_start_control <- read.delim("output/RNAfold/mfe_files/mfe_CDS_starts_sequences_150_25nt_control.tsv", header=TRUE, row.names=1)
mfe_CDS_25nt_stop_control <- read.delim("output/RNAfold/mfe_files/mfe_CDS_ends_sequences_150_25nt_control.tsv", header=TRUE, row.names=1)
df_mfe_CDS_25nt_quartiles <- data.frame(cbind(ntpos=c(rep(c(-137.5:137.5),2)), mfe_distrib=c(apply(mfe_CDS_25nt_start, 1, median), apply(mfe_CDS_25nt_stop, 1, median),apply(mfe_CDS_25nt_start, 1, quantile,probs=0.25), apply(mfe_CDS_25nt_stop, 1, quantile,probs=0.25),apply(mfe_CDS_25nt_start, 1,  quantile,probs=0.75), apply(mfe_CDS_25nt_stop, 1,  quantile,probs=0.75))))
df_mfe_CDS_25nt_quartiles$startstop <- rep(c(rep("start",276), rep("stop", 276)),3)
df_mfe_CDS_25nt_quartiles$median_quartile <- c(rep("median", 552), rep("1quartile", 552), rep("3quartile",552))

mean_or_median = mean
df_mfe_CDS_25nt <- data.frame(cbind(ntpos=c(rep(c(-137.5:137.5),4)), mfe=c(apply(mfe_CDS_25nt_start, 1, mean_or_median), apply(mfe_CDS_25nt_stop, 1, mean_or_median), apply(mfe_CDS_25nt_start_control, 1, mean_or_median), apply(mfe_CDS_25nt_stop_control,1, mean_or_median))))
df_mfe_CDS_25nt$startstop <- rep(c(rep("start",276), rep("stop", 276)),2)
df_mfe_CDS_25nt$control <- c(rep("data", 552), rep("control", 552))
p <- ggplot(data=df_mfe_CDS_25nt, aes(x=ntpos, y=mfe, color=startstop)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(aes(linetype=control), size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("black", "darkgray"), name  ="Position", breaks=c("start", "stop"), labels=c("Start", "End")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(-3.4,-1)
p

ggsave("output/RNAfold/figures/deltaG_CDS_25nt_shuffled.pdf", plot=p, width=10, height=7, units="cm")

p <- ggplot(data=df_mfe_CDS_25nt_quartiles, aes(x=ntpos, y=mfe_distrib, color=startstop, linetype=median_quartile)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + 
  scale_color_manual(values=c("black", "darkgray"), name  ="Position", breaks=c("start", "stop"), labels=c("Start", "End")) + 
  scale_linetype_manual(name="Quartiles",values=c(3,1,3), breaks=c("1quartile", "median", "3quartile"), labels=c("1st", "2nd", "3rd")) +
  xlab("Distance Relative to Cleavage Site (nt)") + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) + ylim(-5.1,0.1)
p

ggsave("output/RNAfold/figures/deltaG_CDS_25nt_quartiles.pdf", plot=p, width=10, height=7, units="cm")

p <- ggplot(data=subset(df_mfe_CDS_25nt, df_mfe_random_25nt$control=="data"), aes(x=ntpos, y=mfe, color=startstop)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_linetype_manual(values=c(4,1), name="Control", breaks=c("control", "data"), labels=c("shuffled", "data")) + scale_color_manual(values=c("black", "darkgray"), name  ="Position", breaks=c("start", "stop"), labels=c("Start", "End")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab(expression(Delta*"G")) + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(-3.4,-1)
p

ggsave("output/RNAfold/figures/deltaG_CDS_25nt.pdf", plot=p, width=10, height=7, units="cm")

4.9 For AU content, ACGU content

Check for nucleotide content around processing sites, compare between different sets of processing sites.

length_up_down <- 50
peaks_all <- extract_RNA_sequences(PSS_Ranges, syne_fasta, length_up_down)
peaks_up <- extract_RNA_sequences(PSS_Ranges_up, syne_fasta, length_up_down)
peaks_down <- extract_RNA_sequences(PSS_Ranges_down, syne_fasta, length_up_down)

cons_mat <- consensusMatrix(peaks_up)
perc_AU_up <- (cons_mat[1,]+cons_mat[4,])/apply(cons_mat[1:4,], 2,sum)
df_perc_ACGU_up <- data.frame(cons_mat[1:4,])/sum(cons_mat[,1])*100
df_perc_ACGU_up <- data.frame(cbind(ntpos=c(-length_up_down:-1,1:length_up_down), t(df_perc_ACGU_up)))
df_perc_AU_up <- df_perc_ACGU_up[,c(1,2,5)]
df_perc_CG_up <- df_perc_ACGU_up[,c(1,3,4)]
df_perc_ACGU_up <- pivot_longer(df_perc_ACGU_up, 2:5)
df_perc_AU_up <- pivot_longer(df_perc_AU_up, 2:3)
df_perc_CG_up <- pivot_longer(df_perc_CG_up, 2:3)

cons_mat <- consensusMatrix(peaks_down)
perc_AU_down <- (cons_mat[1,]+cons_mat[4,])/apply(cons_mat[1:4,], 2,sum)
df_perc_ACGU_down <- data.frame(cons_mat[1:4,])/sum(cons_mat[,1])*100
df_perc_ACGU_down <- data.frame(cbind(ntpos=c(-length_up_down:-1,1:length_up_down), t(df_perc_ACGU_down)))
df_perc_AU_down <- df_perc_ACGU_down[,c(1,2,5)]
df_perc_CG_down <- df_perc_ACGU_down[,c(1,3,4)]
df_perc_ACGU_down <- pivot_longer(df_perc_ACGU_down, 2:5)
df_perc_AU_down <- pivot_longer(df_perc_AU_down, 2:3)
df_perc_CG_down <- pivot_longer(df_perc_CG_down, 2:3)

df_percAU <- data.frame(cbind(ntpos=rep(c(-length_up_down:-1,1:length_up_down),2),percAU=c(perc_AU_up, perc_AU_down)*100))
df_percAU$updown <- c(rep("up",2*length_up_down), rep("down", 2*length_up_down))

general_AU <- (sum(oligonucleotideFrequency(syne_fasta, width=1)[,c(1,4)])/sum(oligonucleotideFrequency(syne_fasta, width=1)))*100

ACGU_colors <- palette_OkabeIto[c(3,1,5,6)]
names(ACGU_colors) <- c("U", "G", "C", "A")

Compare AU content between peak positions which are either up- or downregulated

p <- ggplot(data=df_percAU, aes(x=ntpos, y=percAU, color=updown)) + geom_hline(aes(yintercept=general_AU), color="darkgray", linetype=2) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=.5) + theme_light() + 
  scale_color_manual(values=c("#005a96ff", "#e69f00ff"), name  ="Peaks",
                           breaks=c("up", "down"),
                           labels=c("rne(WT)>rne(Ts)", "rne(Ts)>rne(WT)")) + xlab("Distance Relative to Cleavage Site (nt)") + ylab("Percentage AU (%)")
  
p <- p + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5))

ggsave("output/nt_content_plots/AUcontent.pdf", plot=p, width=10, height=7, units="cm")

Code for disinguishing nts:

p <- ggplot(data=df_perc_ACGU_up, aes(x=ntpos, y=value, color=name)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=0.5) + theme_light() + scale_color_manual(name="nt", values=ACGU_colors)+ xlab("Distance Relative to Cleavage Site (nt)") + ylab("Percentage ACGU (%)") + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5)) +ylim(5,72)+xlim(-25,+25)
p
## Warning: Removed 200 row(s) containing missing values (geom_path).

ggsave("output/nt_content_plots/ACGU_up_content.pdf", plot=p, width=10, height=7, units="cm")
## Warning: Removed 200 row(s) containing missing values (geom_path).
p <- ggplot(data=df_perc_ACGU_down, aes(x=ntpos, y=value, color=name)) + geom_vline(aes(xintercept=0), linetype=2, color="darkgray") + geom_line(size=.5) + theme_light() + scale_color_manual(name="nt", values=ACGU_colors)+ xlab("Distance Relative to Cleavage Site (nt)") + ylab("Percentage ACGU (%)") + theme(legend.justification=c(1,1), legend.position=c(0.95,0.95), legend.background = element_rect(colour="gray", linetype="solid", fill="white", size=.5))+ylim(5,72)+xlim(-25,+25)
p
## Warning: Removed 200 row(s) containing missing values (geom_path).

ggsave("output/nt_content_plots/ACGU_down_content.pdf", plot=p, width=10, height=7, units="cm")
## Warning: Removed 200 row(s) containing missing values (geom_path).

Session info

## R version 4.0.4 (2021-02-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 11 (bullseye)
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
## 
## locale:
##  [1] LC_CTYPE=de_DE.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=de_DE.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=de_DE.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] clusterProfiler_3.18.1      colorblindr_0.1.0          
##  [3] colorspace_2.0-0            viridis_0.5.1              
##  [5] viridisLite_0.3.0           rtracklayer_1.50.0         
##  [7] Biostrings_2.58.0           XVector_0.30.0             
##  [9] vsn_3.58.0                  RColorBrewer_1.1-2         
## [11] pheatmap_1.0.12             ggpubr_0.4.0               
## [13] ggrepel_0.9.1               edgeR_3.32.1               
## [15] limma_3.46.0                DESeq2_1.30.1              
## [17] SummarizedExperiment_1.20.0 Biobase_2.50.0             
## [19] MatrixGenerics_1.2.1        matrixStats_0.58.0         
## [21] GenomicRanges_1.42.0        GenomeInfoDb_1.26.2        
## [23] IRanges_2.24.1              S4Vectors_0.28.1           
## [25] BiocGenerics_0.36.0         forcats_0.5.1              
## [27] stringr_1.4.0               dplyr_1.0.5                
## [29] purrr_0.3.4                 readr_1.4.0                
## [31] tidyr_1.1.3                 tibble_3.1.0               
## [33] ggplot2_3.3.3               tidyverse_1.3.0            
## [35] knitr_1.31                 
## 
## loaded via a namespace (and not attached):
##   [1] shadowtext_0.0.7         readxl_1.3.1             backports_1.2.1         
##   [4] fastmatch_1.1-0          igraph_1.2.6             plyr_1.8.6              
##   [7] splines_4.0.4            BiocParallel_1.24.1      digest_0.6.27           
##  [10] htmltools_0.5.1.1        GOSemSim_2.16.1          GO.db_3.12.1            
##  [13] fansi_0.4.2              magrittr_2.0.1           memoise_2.0.0           
##  [16] openxlsx_4.2.3           graphlayouts_0.7.1       annotate_1.68.0         
##  [19] modelr_0.1.8             enrichplot_1.10.2        blob_1.2.1              
##  [22] rvest_1.0.0              haven_2.3.1              xfun_0.22               
##  [25] crayon_1.4.1             RCurl_1.98-1.2           jsonlite_1.7.2          
##  [28] scatterpie_0.1.5         genefilter_1.72.1        survival_3.2-7          
##  [31] glue_1.4.2               polyclip_1.10-0          gtable_0.3.0            
##  [34] zlibbioc_1.36.0          DelayedArray_0.16.2      car_3.0-10              
##  [37] abind_1.4-5              scales_1.1.1             DOSE_3.16.0             
##  [40] DBI_1.1.1                rstatix_0.7.0            Rcpp_1.0.6              
##  [43] xtable_1.8-4             foreign_0.8-81           bit_4.0.4               
##  [46] preprocessCore_1.52.1    httr_1.4.2               fgsea_1.16.0            
##  [49] ellipsis_0.3.1           farver_2.1.0             pkgconfig_2.0.3         
##  [52] XML_3.99-0.5             sass_0.3.1               dbplyr_2.1.0            
##  [55] locfit_1.5-9.4           utf8_1.1.4               labeling_0.4.2          
##  [58] tidyselect_1.1.0         rlang_0.4.10             reshape2_1.4.4          
##  [61] AnnotationDbi_1.52.0     munsell_0.5.0            cellranger_1.1.0        
##  [64] tools_4.0.4              cachem_1.0.4             downloader_0.4          
##  [67] cli_2.3.1                generics_0.1.0           RSQLite_2.2.3           
##  [70] broom_0.7.5              evaluate_0.14            fastmap_1.1.0           
##  [73] yaml_2.2.1               bit64_4.0.5              fs_1.5.0                
##  [76] tidygraph_1.2.0          zip_2.1.1                ggraph_2.0.5            
##  [79] DO.db_2.9                xml2_1.3.2               compiler_4.0.4          
##  [82] rstudioapi_0.13          curl_4.3                 affyio_1.60.0           
##  [85] ggsignif_0.6.1           reprex_1.0.0             tweenr_1.0.1            
##  [88] geneplotter_1.68.0       bslib_0.2.4              stringi_1.5.3           
##  [91] highr_0.8                lattice_0.20-41          Matrix_1.3-2            
##  [94] vctrs_0.3.6              pillar_1.5.1             lifecycle_1.0.0         
##  [97] BiocManager_1.30.10      jquerylib_0.1.3          cowplot_1.1.1           
## [100] data.table_1.14.0        bitops_1.0-6             qvalue_2.22.0           
## [103] R6_2.5.0                 affy_1.68.0              gridExtra_2.3           
## [106] rio_0.5.26               MASS_7.3-53.1            assertthat_0.2.1        
## [109] withr_2.4.1              GenomicAlignments_1.26.0 Rsamtools_2.6.0         
## [112] GenomeInfoDbData_1.2.4   hms_1.0.0                grid_4.0.4              
## [115] rvcheck_0.1.8            rmarkdown_2.7            carData_3.0-4           
## [118] ggforce_0.3.3            lubridate_1.7.10

References

Chao, Yanjie, Lei Li, Dylan Girodat, Konrad U Förstner, Nelly Said, Colin Corcoran, Michał Śmiga, et al. 2017. “In Vivo Cleavage Map Illuminates the Central Role of Rnase E in Coding and Non-Coding Rna Pathways.” Molecular Cell 65 (1): 39–51.

Förstner, Konrad U, Carina M Reuscher, Kerstin Haberzettl, Lennart Weber, and Gabriele Klug. 2018. “RNase E Cleavage Shapes the Transcriptome of Rhodobacter Sphaeroides and Strongly Impacts Phototrophic Growth.” Life Science Alliance 1 (4). https://doi.org/10.26508/lsa.201800080.

Innocenti, Nicolas, Monica Golumbeanu, Aymeric Fouquier d’Hérouël, Caroline Lacoux, Rémy A. Bonnin, Sean P. Kennedy, Françoise Wessner, et al. 2015. “Whole-Genome Mapping of 5’ Rna Ends in Bacteria by Tagged Sequencing: A Comprehensive View in Enterococcus Faecalis.” RNA 21 (5): 1018–30. https://doi.org/10.1261/rna.048470.114.

Kopf, M., S. Klähn, I. Scholz, J. K. F. Matthiessen, W. R. Hess, and B. Voss. 2014. “Comparative Analysis of the Primary Transcriptome of Synechocystis Sp. PCC 6803.” DNA Research 21 (5): 527–39. https://doi.org/10.1093/dnares/dsu018.

Sakurai, Isamu, Damir Stazic, Marion Eisenhut, Eerika Vuorio, Claudia Steglich, Wolfgang R. Hess, and Eva-Mari Aro. 2012. “Positive Regulation of psbA Gene Expression by Cis-Encoded Antisense RNAs in Synechocystis Sp. PCC 6803.” Plant Physiology 160 (2): 1000–1010. https://doi.org/10.1104/pp.112.202127.